[lld] [llvm] [Support] Support nested parallel TaskGroup via work-stealing (PR #189293)
Fangrui Song via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 31 23:22:13 PDT 2026
https://github.com/MaskRay updated https://github.com/llvm/llvm-project/pull/189293
>From 4a4bd2ee7f58f46e85289ad873e5d1426b66fb19 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sat, 28 Mar 2026 12:25:40 -0700
Subject: [PATCH 1/5] [Support] Optimize parallel `TaskGroup`
Two improvements to reduce `TaskGroup::spawn()` overhead:
1. Replace mutex-based `Latch::inc()` with atomic `fetch_add`. `dec()`
retains the mutex to prevent a race where `sync()` observes Count==0
and returns, letting the caller destroy the Latch while `dec()` is
still notifying the condition variable.
2. Pass `Latch&` through `Executor::add()` so the worker calls `dec()`
directly, eliminating the wrapper lambda that previously captured
both the user's callable and the Latch reference. This avoids one
`std::function` construction and potential heap allocation per spawn.
---
llvm/include/llvm/Support/Parallel.h | 19 +++++-------
llvm/lib/Support/Parallel.cpp | 44 +++++++++++++++-------------
2 files changed, 31 insertions(+), 32 deletions(-)
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index b0c9e8f29f970..f3bb9259c5c3a 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -17,6 +17,7 @@
#include "llvm/Support/Threading.h"
#include <algorithm>
+#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
@@ -58,31 +59,27 @@ inline size_t getThreadCount() { return 1; }
namespace detail {
class Latch {
- uint32_t Count;
+ std::atomic<uint32_t> Count;
mutable std::mutex Mutex;
mutable std::condition_variable Cond;
public:
explicit Latch(uint32_t Count = 0) : Count(Count) {}
- ~Latch() {
- // Ensure at least that sync() was called.
- assert(Count == 0);
- }
+ ~Latch() { assert(Count.load(std::memory_order_relaxed) == 0); }
- void inc() {
- std::lock_guard<std::mutex> lock(Mutex);
- ++Count;
- }
+ void inc() { Count.fetch_add(1, std::memory_order_relaxed); }
+ // dec() must hold Mutex so that sync() cannot observe Count==0 and
+ // destroy the Latch while dec() is still running.
void dec() {
std::lock_guard<std::mutex> lock(Mutex);
- if (--Count == 0)
+ if (Count.fetch_sub(1, std::memory_order_acq_rel) == 1)
Cond.notify_all();
}
void sync() const {
std::unique_lock<std::mutex> lock(Mutex);
- Cond.wait(lock, [&] { return Count == 0; });
+ Cond.wait(lock, [&] { return Count.load(std::memory_order_relaxed) == 0; });
}
};
} // namespace detail
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 2348b788d3ea7..80cc6fdb05766 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -96,10 +96,10 @@ class ThreadPoolExecutor {
static void call(void *Ptr) { ((ThreadPoolExecutor *)Ptr)->stop(); }
};
- void add(std::function<void()> F) {
+ void add(std::function<void()> F, parallel::detail::Latch &L) {
{
std::lock_guard<std::mutex> Lock(Mutex);
- WorkStack.push_back(std::move(F));
+ WorkStack.push_back({std::move(F), std::ref(L)});
}
Cond.notify_one();
}
@@ -141,34 +141,39 @@ class ThreadPoolExecutor {
[&] { TheJobserver->release(std::move(Slot)); });
while (true) {
- std::function<void()> Task;
- {
- std::unique_lock<std::mutex> Lock(Mutex);
- Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
- if (Stop && WorkStack.empty())
- return;
- if (WorkStack.empty())
- break;
- Task = std::move(WorkStack.back());
- WorkStack.pop_back();
- }
- Task();
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+ if (Stop && WorkStack.empty())
+ return;
+ if (WorkStack.empty())
+ break;
+ auto Item = std::move(WorkStack.back());
+ WorkStack.pop_back();
+ Lock.unlock();
+ Item.F();
+ Item.L.get().dec();
}
} else {
std::unique_lock<std::mutex> Lock(Mutex);
Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
if (Stop)
break;
- auto Task = std::move(WorkStack.back());
+ auto Item = std::move(WorkStack.back());
WorkStack.pop_back();
Lock.unlock();
- Task();
+ Item.F();
+ Item.L.get().dec();
}
}
}
+ struct WorkItem {
+ std::function<void()> F;
+ std::reference_wrapper<parallel::detail::Latch> L;
+ };
+
std::atomic<bool> Stop{false};
- std::vector<std::function<void()>> WorkStack;
+ std::vector<WorkItem> WorkStack;
std::mutex Mutex;
std::condition_variable Cond;
std::promise<void> ThreadsCreated;
@@ -228,10 +233,7 @@ void TaskGroup::spawn(std::function<void()> F) {
#if LLVM_ENABLE_THREADS
if (Parallel) {
L.inc();
- getDefaultExecutor()->add([&, F = std::move(F)] {
- F();
- L.dec();
- });
+ getDefaultExecutor()->add(std::move(F), L);
return;
}
#endif
>From e0be3ba3d7a3a87c8daf1d2403d4954e7e519eb0 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sun, 29 Mar 2026 01:30:45 -0700
Subject: [PATCH 2/5] [Support] Support nested parallel TaskGroup via
work-stealing
Nested TaskGroups run serially to prevent deadlock, as documented by
https://reviews.llvm.org/D61115 and refined by
https://reviews.llvm.org/D148984 to use threadIndex.
Enable nested parallelism by having worker threads actively execute
tasks from the work queue while waiting (work-stealing), instead of
just blocking. Root-level TaskGroups (main thread) keep the efficient
blocking Latch::sync(), so there is no overhead for the common
non-nested case.
In lld, https://reviews.llvm.org/D131247 worked around the limitation
by passing a single root TaskGroup into OutputSection::writeTo and
spawning 4MB-chunked tasks into it. However, SyntheticSection::writeTo
calls with internal parallelism (e.g. GdbIndexSection,
MergeNoTailSection) still ran serially on worker threads. With this
change, their internal parallelFor/parallelForEach calls parallelize
automatically via helpSync work-stealing.
The increased parallelism can reorder error messages from parallel
phases (e.g. relocation processing during section writes), so one lld
test is updated to use --threads=1 for deterministic output.
---
lld/test/ELF/merge-piece-oob.s | 4 +-
llvm/include/llvm/Support/Parallel.h | 6 +-
llvm/lib/Support/Parallel.cpp | 32 ++++++++---
llvm/unittests/Support/ParallelTest.cpp | 75 ++++++-------------------
4 files changed, 45 insertions(+), 72 deletions(-)
diff --git a/lld/test/ELF/merge-piece-oob.s b/lld/test/ELF/merge-piece-oob.s
index 1ff34768a4d13..f798c9eba5a0e 100644
--- a/lld/test/ELF/merge-piece-oob.s
+++ b/lld/test/ELF/merge-piece-oob.s
@@ -3,8 +3,8 @@
## Non-section symbols and offset <= section_size are accepted, matching GNU ld.
# RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64
-# RUN: not ld.lld %t.o -o /dev/null -shared 2>&1 | FileCheck %s -DPREFIX=error --implicit-check-not=error:
-# RUN: ld.lld %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s -DPREFIX=warning --implicit-check-not=warning:
+# RUN: not ld.lld --threads=1 %t.o -o /dev/null -shared 2>&1 | FileCheck %s -DPREFIX=error --implicit-check-not=error:
+# RUN: ld.lld --threads=1 %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s -DPREFIX=warning --implicit-check-not=warning:
## .foo is 8 bytes with entsize=8 (1 piece). .foo+8 (offset==size) is accepted.
# CHECK: [[PREFIX]]: {{.*}}:(.foo): offset 0x9 is outside the section
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index f3bb9259c5c3a..14ddee720a87e 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -77,6 +77,8 @@ class Latch {
Cond.notify_all();
}
+ uint32_t getCount() const { return Count.load(std::memory_order_acquire); }
+
void sync() const {
std::unique_lock<std::mutex> lock(Mutex);
Cond.wait(lock, [&] { return Count.load(std::memory_order_relaxed) == 0; });
@@ -93,12 +95,8 @@ class TaskGroup {
LLVM_ABI ~TaskGroup();
// Spawn a task, but does not wait for it to finish.
- // Tasks marked with \p Sequential will be executed
- // exactly in the order which they were spawned.
LLVM_ABI void spawn(std::function<void()> f);
- void sync() const { L.sync(); }
-
bool isParallel() const { return Parallel; }
};
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 80cc6fdb05766..1459bcb2d868f 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -104,6 +104,22 @@ class ThreadPoolExecutor {
Cond.notify_one();
}
+ // Execute tasks from the work queue until the latch reaches zero.
+ // Used by nested TaskGroups (on worker threads) to prevent deadlock:
+ // instead of blocking in sync(), actively help drain the queue.
+ void helpSync(const parallel::detail::Latch &L) {
+ while (L.getCount() != 0) {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ if (WorkStack.empty())
+ return;
+ auto Item = std::move(WorkStack.back());
+ WorkStack.pop_back();
+ Lock.unlock();
+ Item.F();
+ Item.L.get().dec();
+ }
+ }
+
size_t getThreadCount() const { return ThreadCount; }
private:
@@ -210,22 +226,24 @@ size_t parallel::getThreadCount() {
}
#endif
-// Latch::sync() called by the dtor may cause one thread to block. If is a dead
-// lock if all threads in the default executor are blocked. To prevent the dead
-// lock, only allow the root TaskGroup to run tasks parallelly. In the scenario
-// of nested parallel_for_each(), only the outermost one runs parallelly.
+// Nested TaskGroups on worker threads use helpSync() which executes tasks from
+// the queue while waiting, preventing deadlock while enabling nested
+// parallelism.
TaskGroup::TaskGroup()
: Parallel(
#if LLVM_ENABLE_THREADS
- strategy.ThreadsRequested != 1 && threadIndex == UINT_MAX
+ strategy.ThreadsRequested != 1
#else
false
#endif
) {
}
TaskGroup::~TaskGroup() {
- // We must ensure that all the workloads have finished before decrementing the
- // instances count.
+#if LLVM_ENABLE_THREADS
+ // In a nested TaskGroup (threadIndex != -1u), actively help drain the queue.
+ if (Parallel && threadIndex != UINT_MAX)
+ getDefaultExecutor()->helpSync(L);
+#endif
L.sync();
}
diff --git a/llvm/unittests/Support/ParallelTest.cpp b/llvm/unittests/Support/ParallelTest.cpp
index c7ecc4eff6c29..ad833419cf1bb 100644
--- a/llvm/unittests/Support/ParallelTest.cpp
+++ b/llvm/unittests/Support/ParallelTest.cpp
@@ -95,73 +95,30 @@ TEST(Parallel, ForEachError) {
#if LLVM_ENABLE_THREADS
TEST(Parallel, NestedTaskGroup) {
- // This test checks:
- // 1. Root TaskGroup is in Parallel mode.
- // 2. Nested TaskGroup is not in Parallel mode.
parallel::TaskGroup tg;
-
- tg.spawn([&]() {
- EXPECT_TRUE(tg.isParallel() || (parallel::strategy.ThreadsRequested == 1));
- });
+ EXPECT_TRUE(tg.isParallel() || (parallel::strategy.ThreadsRequested == 1));
tg.spawn([&]() {
parallel::TaskGroup nestedTG;
- EXPECT_FALSE(nestedTG.isParallel());
-
- nestedTG.spawn([&]() {
- // Check that root TaskGroup is in Parallel mode.
- EXPECT_TRUE(tg.isParallel() ||
- (parallel::strategy.ThreadsRequested == 1));
-
- // Check that nested TaskGroup is not in Parallel mode.
- EXPECT_FALSE(nestedTG.isParallel());
- });
+ EXPECT_TRUE(nestedTG.isParallel() ||
+ (parallel::strategy.ThreadsRequested == 1));
});
}
-TEST(Parallel, ParallelNestedTaskGroup) {
- // This test checks that it is possible to have several TaskGroups
- // run from different threads in Parallel mode.
- std::atomic<size_t> Count{0};
-
- {
- std::function<void()> Fn = [&]() {
- parallel::TaskGroup tg;
-
- tg.spawn([&]() {
- // Check that root TaskGroup is in Parallel mode.
- EXPECT_TRUE(tg.isParallel() ||
- (parallel::strategy.ThreadsRequested == 1));
-
- // Check that nested TaskGroup is not in Parallel mode.
- parallel::TaskGroup nestedTG;
- EXPECT_FALSE(nestedTG.isParallel());
- ++Count;
-
- nestedTG.spawn([&]() {
- // Check that root TaskGroup is in Parallel mode.
- EXPECT_TRUE(tg.isParallel() ||
- (parallel::strategy.ThreadsRequested == 1));
-
- // Check that nested TaskGroup is not in Parallel mode.
- EXPECT_FALSE(nestedTG.isParallel());
- ++Count;
- });
+// Verify nested parallelFor doesn't deadlock. This is a simplified version of
+// the pattern from https://reviews.llvm.org/D61115 that originally motivated
+// serializing nested TaskGroups. With work-stealing in helpSync(), nested
+// parallelism now works without deadlock.
+TEST(Parallel, NestedParallelFor) {
+ std::atomic<uint32_t> count{0};
+ parallelFor(0, 8, [&](size_t i) {
+ parallelFor(0, 8, [&](size_t j) {
+ parallelFor(0, 8, [&](size_t k) {
+ count.fetch_add(1, std::memory_order_relaxed);
});
- };
-
- DefaultThreadPool Pool;
-
- Pool.async(Fn);
- Pool.async(Fn);
- Pool.async(Fn);
- Pool.async(Fn);
- Pool.async(Fn);
- Pool.async(Fn);
-
- Pool.wait();
- }
- EXPECT_EQ(Count, 12ul);
+ });
+ });
+ EXPECT_EQ(count.load(), 512u);
}
#endif
>From 12e9d340eee150ccf42befb3319d81be6102100c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sun, 29 Mar 2026 19:57:06 -0700
Subject: [PATCH 3/5] rebase
---
llvm/lib/Support/Parallel.cpp | 41 ++++++++++++++++++-----------------
1 file changed, 21 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 1459bcb2d868f..35c98e140eccd 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -96,6 +96,15 @@ class ThreadPoolExecutor {
static void call(void *Ptr) { ((ThreadPoolExecutor *)Ptr)->stop(); }
};
+ struct WorkItem {
+ std::function<void()> F;
+ std::reference_wrapper<parallel::detail::Latch> L;
+ void operator()() {
+ F();
+ L.get().dec();
+ }
+ };
+
void add(std::function<void()> F, parallel::detail::Latch &L) {
{
std::lock_guard<std::mutex> Lock(Mutex);
@@ -112,17 +121,22 @@ class ThreadPoolExecutor {
std::unique_lock<std::mutex> Lock(Mutex);
if (WorkStack.empty())
return;
- auto Item = std::move(WorkStack.back());
- WorkStack.pop_back();
- Lock.unlock();
- Item.F();
- Item.L.get().dec();
+ popAndRun(Lock);
}
}
size_t getThreadCount() const { return ThreadCount; }
private:
+ // Pop one task from the queue and run it. Must be called with Lock held;
+ // releases Lock before executing the task.
+ void popAndRun(std::unique_lock<std::mutex> &Lock) {
+ auto Item = std::move(WorkStack.back());
+ WorkStack.pop_back();
+ Lock.unlock();
+ Item();
+ }
+
void work(ThreadPoolStrategy S, unsigned ThreadID) {
threadIndex = ThreadID;
S.apply_thread_strategy(ThreadID);
@@ -163,31 +177,18 @@ class ThreadPoolExecutor {
return;
if (WorkStack.empty())
break;
- auto Item = std::move(WorkStack.back());
- WorkStack.pop_back();
- Lock.unlock();
- Item.F();
- Item.L.get().dec();
+ popAndRun(Lock);
}
} else {
std::unique_lock<std::mutex> Lock(Mutex);
Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
if (Stop)
break;
- auto Item = std::move(WorkStack.back());
- WorkStack.pop_back();
- Lock.unlock();
- Item.F();
- Item.L.get().dec();
+ popAndRun(Lock);
}
}
}
- struct WorkItem {
- std::function<void()> F;
- std::reference_wrapper<parallel::detail::Latch> L;
- };
-
std::atomic<bool> Stop{false};
std::vector<WorkItem> WorkStack;
std::mutex Mutex;
>From 26ca726cebae0ed2a940f02e0935a74c9defadfb Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Mon, 30 Mar 2026 10:20:19 -0700
Subject: [PATCH 4/5] comment
---
llvm/include/llvm/Support/Parallel.h | 1 +
llvm/lib/Support/Parallel.cpp | 14 ++++++++++----
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index 14ddee720a87e..4c69b950cb082 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -73,6 +73,7 @@ class Latch {
// destroy the Latch while dec() is still running.
void dec() {
std::lock_guard<std::mutex> lock(Mutex);
+ // fetch_sub returns the previous value; == 1 means Count is now 0.
if (Count.fetch_sub(1, std::memory_order_acq_rel) == 1)
Cond.notify_all();
}
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 35c98e140eccd..0cbba3878bb3a 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -227,9 +227,14 @@ size_t parallel::getThreadCount() {
}
#endif
-// Nested TaskGroups on worker threads use helpSync() which executes tasks from
-// the queue while waiting, preventing deadlock while enabling nested
-// parallelism.
+static bool isNested() {
+#if LLVM_ENABLE_THREADS
+ return threadIndex != UINT_MAX;
+#else
+ return false;
+#endif
+}
+
TaskGroup::TaskGroup()
: Parallel(
#if LLVM_ENABLE_THREADS
@@ -239,10 +244,11 @@ TaskGroup::TaskGroup()
#endif
) {
}
+
TaskGroup::~TaskGroup() {
#if LLVM_ENABLE_THREADS
// In a nested TaskGroup (threadIndex != -1u), actively help drain the queue.
- if (Parallel && threadIndex != UINT_MAX)
+ if (Parallel && isNested())
getDefaultExecutor()->helpSync(L);
#endif
L.sync();
>From c3d689a3336aa5d02ee39272024951130dd62463 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Tue, 31 Mar 2026 23:22:00 -0700
Subject: [PATCH 5/5] add Stop
---
llvm/lib/Support/Parallel.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 0cbba3878bb3a..a7ca21e89f619 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -119,7 +119,7 @@ class ThreadPoolExecutor {
void helpSync(const parallel::detail::Latch &L) {
while (L.getCount() != 0) {
std::unique_lock<std::mutex> Lock(Mutex);
- if (WorkStack.empty())
+ if (Stop || WorkStack.empty())
return;
popAndRun(Lock);
}
More information about the llvm-commits
mailing list