[lld] [llvm] [Support] Support nested parallel TaskGroup via work-stealing (PR #189293)

Fangrui Song via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 31 23:22:13 PDT 2026


https://github.com/MaskRay updated https://github.com/llvm/llvm-project/pull/189293

>From 4a4bd2ee7f58f46e85289ad873e5d1426b66fb19 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sat, 28 Mar 2026 12:25:40 -0700
Subject: [PATCH 1/5] [Support] Optimize parallel `TaskGroup`

Two improvements to reduce `TaskGroup::spawn()` overhead:

1. Replace mutex-based `Latch::inc()` with atomic `fetch_add`. `dec()`
   retains the mutex to prevent a race where `sync()` observes Count==0
   and destroys the Latch while `dec()` is still running.

2. Pass `Latch&` through `Executor::add()` so the worker calls `dec()`
   directly, eliminating the wrapper lambda that previously captured
   both the user's callable and the Latch reference. This avoids one
   `std::function` construction and potential heap allocation per spawn.
---
 llvm/include/llvm/Support/Parallel.h | 19 +++++-------
 llvm/lib/Support/Parallel.cpp        | 44 +++++++++++++++-------------
 2 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index b0c9e8f29f970..f3bb9259c5c3a 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -17,6 +17,7 @@
 #include "llvm/Support/Threading.h"
 
 #include <algorithm>
+#include <atomic>
 #include <condition_variable>
 #include <functional>
 #include <mutex>
@@ -58,31 +59,27 @@ inline size_t getThreadCount() { return 1; }
 
 namespace detail {
 class Latch {
-  uint32_t Count;
+  std::atomic<uint32_t> Count;
   mutable std::mutex Mutex;
   mutable std::condition_variable Cond;
 
 public:
   explicit Latch(uint32_t Count = 0) : Count(Count) {}
-  ~Latch() {
-    // Ensure at least that sync() was called.
-    assert(Count == 0);
-  }
+  ~Latch() { assert(Count.load(std::memory_order_relaxed) == 0); }
 
-  void inc() {
-    std::lock_guard<std::mutex> lock(Mutex);
-    ++Count;
-  }
+  void inc() { Count.fetch_add(1, std::memory_order_relaxed); }
 
+  // dec() must hold Mutex so that sync() cannot observe Count==0 and
+  // destroy the Latch while dec() is still running.
   void dec() {
     std::lock_guard<std::mutex> lock(Mutex);
-    if (--Count == 0)
+    if (Count.fetch_sub(1, std::memory_order_acq_rel) == 1)
       Cond.notify_all();
   }
 
   void sync() const {
     std::unique_lock<std::mutex> lock(Mutex);
-    Cond.wait(lock, [&] { return Count == 0; });
+    Cond.wait(lock, [&] { return Count.load(std::memory_order_relaxed) == 0; });
   }
 };
 } // namespace detail
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 2348b788d3ea7..80cc6fdb05766 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -96,10 +96,10 @@ class ThreadPoolExecutor {
     static void call(void *Ptr) { ((ThreadPoolExecutor *)Ptr)->stop(); }
   };
 
-  void add(std::function<void()> F) {
+  void add(std::function<void()> F, parallel::detail::Latch &L) {
     {
       std::lock_guard<std::mutex> Lock(Mutex);
-      WorkStack.push_back(std::move(F));
+      WorkStack.push_back({std::move(F), std::ref(L)});
     }
     Cond.notify_one();
   }
@@ -141,34 +141,39 @@ class ThreadPoolExecutor {
             [&] { TheJobserver->release(std::move(Slot)); });
 
         while (true) {
-          std::function<void()> Task;
-          {
-            std::unique_lock<std::mutex> Lock(Mutex);
-            Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
-            if (Stop && WorkStack.empty())
-              return;
-            if (WorkStack.empty())
-              break;
-            Task = std::move(WorkStack.back());
-            WorkStack.pop_back();
-          }
-          Task();
+          std::unique_lock<std::mutex> Lock(Mutex);
+          Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+          if (Stop && WorkStack.empty())
+            return;
+          if (WorkStack.empty())
+            break;
+          auto Item = std::move(WorkStack.back());
+          WorkStack.pop_back();
+          Lock.unlock();
+          Item.F();
+          Item.L.get().dec();
         }
       } else {
         std::unique_lock<std::mutex> Lock(Mutex);
         Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
         if (Stop)
           break;
-        auto Task = std::move(WorkStack.back());
+        auto Item = std::move(WorkStack.back());
         WorkStack.pop_back();
         Lock.unlock();
-        Task();
+        Item.F();
+        Item.L.get().dec();
       }
     }
   }
 
+  struct WorkItem {
+    std::function<void()> F;
+    std::reference_wrapper<parallel::detail::Latch> L;
+  };
+
   std::atomic<bool> Stop{false};
-  std::vector<std::function<void()>> WorkStack;
+  std::vector<WorkItem> WorkStack;
   std::mutex Mutex;
   std::condition_variable Cond;
   std::promise<void> ThreadsCreated;
@@ -228,10 +233,7 @@ void TaskGroup::spawn(std::function<void()> F) {
 #if LLVM_ENABLE_THREADS
   if (Parallel) {
     L.inc();
-    getDefaultExecutor()->add([&, F = std::move(F)] {
-      F();
-      L.dec();
-    });
+    getDefaultExecutor()->add(std::move(F), L);
     return;
   }
 #endif

>From e0be3ba3d7a3a87c8daf1d2403d4954e7e519eb0 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sun, 29 Mar 2026 01:30:45 -0700
Subject: [PATCH 2/5] [Support] Support nested parallel TaskGroup via
 work-stealing

Previously, nested TaskGroups ran serially to prevent deadlock, as
documented by https://reviews.llvm.org/D61115 and refined by
https://reviews.llvm.org/D148984 to use threadIndex.

Enable nested parallelism by having worker threads actively execute
tasks from the work queue while waiting (work-stealing), instead of
just blocking. Root-level TaskGroups (main thread) keep the efficient
blocking Latch::sync(), so there is no overhead for the common
non-nested case.

In lld, https://reviews.llvm.org/D131247 worked around the limitation
by passing a single root TaskGroup into OutputSection::writeTo and
spawning 4MB-chunked tasks into it. However, SyntheticSection::writeTo
calls with internal parallelism (e.g. GdbIndexSection,
MergeNoTailSection) still ran serially on worker threads. With this
change, their internal parallelFor/parallelForEach calls parallelize
automatically via helpSync work-stealing.

The increased parallelism can reorder error messages from parallel
phases (e.g. relocation processing during section writes), so one lld
test is updated to use --threads=1 for deterministic output.
---
 lld/test/ELF/merge-piece-oob.s          |  4 +-
 llvm/include/llvm/Support/Parallel.h    |  6 +-
 llvm/lib/Support/Parallel.cpp           | 32 ++++++++---
 llvm/unittests/Support/ParallelTest.cpp | 75 ++++++-------------------
 4 files changed, 45 insertions(+), 72 deletions(-)

diff --git a/lld/test/ELF/merge-piece-oob.s b/lld/test/ELF/merge-piece-oob.s
index 1ff34768a4d13..f798c9eba5a0e 100644
--- a/lld/test/ELF/merge-piece-oob.s
+++ b/lld/test/ELF/merge-piece-oob.s
@@ -3,8 +3,8 @@
 ## Non-section symbols and offset <= section_size are accepted, matching GNU ld.
 
 # RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64
-# RUN: not ld.lld %t.o -o /dev/null -shared 2>&1 | FileCheck %s -DPREFIX=error --implicit-check-not=error:
-# RUN: ld.lld %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s -DPREFIX=warning --implicit-check-not=warning:
+# RUN: not ld.lld --threads=1 %t.o -o /dev/null -shared 2>&1 | FileCheck %s -DPREFIX=error --implicit-check-not=error:
+# RUN: ld.lld --threads=1 %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s -DPREFIX=warning --implicit-check-not=warning:
 
 ## .foo is 8 bytes with entsize=8 (1 piece). .foo+8 (offset==size) is accepted.
 # CHECK:      [[PREFIX]]: {{.*}}:(.foo): offset 0x9 is outside the section
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index f3bb9259c5c3a..14ddee720a87e 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -77,6 +77,8 @@ class Latch {
       Cond.notify_all();
   }
 
+  uint32_t getCount() const { return Count.load(std::memory_order_acquire); }
+
   void sync() const {
     std::unique_lock<std::mutex> lock(Mutex);
     Cond.wait(lock, [&] { return Count.load(std::memory_order_relaxed) == 0; });
@@ -93,12 +95,8 @@ class TaskGroup {
   LLVM_ABI ~TaskGroup();
 
   // Spawn a task, but does not wait for it to finish.
-  // Tasks marked with \p Sequential will be executed
-  // exactly in the order which they were spawned.
   LLVM_ABI void spawn(std::function<void()> f);
 
-  void sync() const { L.sync(); }
-
   bool isParallel() const { return Parallel; }
 };
 
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 80cc6fdb05766..1459bcb2d868f 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -104,6 +104,22 @@ class ThreadPoolExecutor {
     Cond.notify_one();
   }
 
+  // Execute tasks from the work queue until the latch reaches zero.
+  // Used by nested TaskGroups (on worker threads) to prevent deadlock:
+  // instead of blocking in sync(), actively help drain the queue.
+  void helpSync(const parallel::detail::Latch &L) {
+    while (L.getCount() != 0) {
+      std::unique_lock<std::mutex> Lock(Mutex);
+      if (WorkStack.empty())
+        return;
+      auto Item = std::move(WorkStack.back());
+      WorkStack.pop_back();
+      Lock.unlock();
+      Item.F();
+      Item.L.get().dec();
+    }
+  }
+
   size_t getThreadCount() const { return ThreadCount; }
 
 private:
@@ -210,22 +226,24 @@ size_t parallel::getThreadCount() {
 }
 #endif
 
-// Latch::sync() called by the dtor may cause one thread to block. If is a dead
-// lock if all threads in the default executor are blocked. To prevent the dead
-// lock, only allow the root TaskGroup to run tasks parallelly. In the scenario
-// of nested parallel_for_each(), only the outermost one runs parallelly.
+// Nested TaskGroups on worker threads use helpSync() which executes tasks from
+// the queue while waiting, preventing deadlock while enabling nested
+// parallelism.
 TaskGroup::TaskGroup()
     : Parallel(
 #if LLVM_ENABLE_THREADS
-          strategy.ThreadsRequested != 1 && threadIndex == UINT_MAX
+          strategy.ThreadsRequested != 1
 #else
           false
 #endif
       ) {
 }
 TaskGroup::~TaskGroup() {
-  // We must ensure that all the workloads have finished before decrementing the
-  // instances count.
+#if LLVM_ENABLE_THREADS
+  // In a nested TaskGroup (threadIndex != -1u), actively help drain the queue.
+  if (Parallel && threadIndex != UINT_MAX)
+    getDefaultExecutor()->helpSync(L);
+#endif
   L.sync();
 }
 
diff --git a/llvm/unittests/Support/ParallelTest.cpp b/llvm/unittests/Support/ParallelTest.cpp
index c7ecc4eff6c29..ad833419cf1bb 100644
--- a/llvm/unittests/Support/ParallelTest.cpp
+++ b/llvm/unittests/Support/ParallelTest.cpp
@@ -95,73 +95,30 @@ TEST(Parallel, ForEachError) {
 
 #if LLVM_ENABLE_THREADS
 TEST(Parallel, NestedTaskGroup) {
-  // This test checks:
-  // 1. Root TaskGroup is in Parallel mode.
-  // 2. Nested TaskGroup is not in Parallel mode.
   parallel::TaskGroup tg;
-
-  tg.spawn([&]() {
-    EXPECT_TRUE(tg.isParallel() || (parallel::strategy.ThreadsRequested == 1));
-  });
+  EXPECT_TRUE(tg.isParallel() || (parallel::strategy.ThreadsRequested == 1));
 
   tg.spawn([&]() {
     parallel::TaskGroup nestedTG;
-    EXPECT_FALSE(nestedTG.isParallel());
-
-    nestedTG.spawn([&]() {
-      // Check that root TaskGroup is in Parallel mode.
-      EXPECT_TRUE(tg.isParallel() ||
-                  (parallel::strategy.ThreadsRequested == 1));
-
-      // Check that nested TaskGroup is not in Parallel mode.
-      EXPECT_FALSE(nestedTG.isParallel());
-    });
+    EXPECT_TRUE(nestedTG.isParallel() ||
+                (parallel::strategy.ThreadsRequested == 1));
   });
 }
 
-TEST(Parallel, ParallelNestedTaskGroup) {
-  // This test checks that it is possible to have several TaskGroups
-  // run from different threads in Parallel mode.
-  std::atomic<size_t> Count{0};
-
-  {
-    std::function<void()> Fn = [&]() {
-      parallel::TaskGroup tg;
-
-      tg.spawn([&]() {
-        // Check that root TaskGroup is in Parallel mode.
-        EXPECT_TRUE(tg.isParallel() ||
-                    (parallel::strategy.ThreadsRequested == 1));
-
-        // Check that nested TaskGroup is not in Parallel mode.
-        parallel::TaskGroup nestedTG;
-        EXPECT_FALSE(nestedTG.isParallel());
-        ++Count;
-
-        nestedTG.spawn([&]() {
-          // Check that root TaskGroup is in Parallel mode.
-          EXPECT_TRUE(tg.isParallel() ||
-                      (parallel::strategy.ThreadsRequested == 1));
-
-          // Check that nested TaskGroup is not in Parallel mode.
-          EXPECT_FALSE(nestedTG.isParallel());
-          ++Count;
-        });
+// Verify nested parallelFor doesn't deadlock. This is a simplified version of
+// the pattern from https://reviews.llvm.org/D61115 that originally motivated
+// serializing nested TaskGroups. With work-stealing in helpSync(), nested
+// parallelism now works without deadlock.
+TEST(Parallel, NestedParallelFor) {
+  std::atomic<uint32_t> count{0};
+  parallelFor(0, 8, [&](size_t i) {
+    parallelFor(0, 8, [&](size_t j) {
+      parallelFor(0, 8, [&](size_t k) {
+        count.fetch_add(1, std::memory_order_relaxed);
       });
-    };
-
-    DefaultThreadPool Pool;
-
-    Pool.async(Fn);
-    Pool.async(Fn);
-    Pool.async(Fn);
-    Pool.async(Fn);
-    Pool.async(Fn);
-    Pool.async(Fn);
-
-    Pool.wait();
-  }
-  EXPECT_EQ(Count, 12ul);
+    });
+  });
+  EXPECT_EQ(count.load(), 512u);
 }
 #endif
 

>From 12e9d340eee150ccf42befb3319d81be6102100c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sun, 29 Mar 2026 19:57:06 -0700
Subject: [PATCH 3/5] rebase

---
 llvm/lib/Support/Parallel.cpp | 41 ++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 1459bcb2d868f..35c98e140eccd 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -96,6 +96,15 @@ class ThreadPoolExecutor {
     static void call(void *Ptr) { ((ThreadPoolExecutor *)Ptr)->stop(); }
   };
 
+  struct WorkItem {
+    std::function<void()> F;
+    std::reference_wrapper<parallel::detail::Latch> L;
+    void operator()() {
+      F();
+      L.get().dec();
+    }
+  };
+
   void add(std::function<void()> F, parallel::detail::Latch &L) {
     {
       std::lock_guard<std::mutex> Lock(Mutex);
@@ -112,17 +121,22 @@ class ThreadPoolExecutor {
       std::unique_lock<std::mutex> Lock(Mutex);
       if (WorkStack.empty())
         return;
-      auto Item = std::move(WorkStack.back());
-      WorkStack.pop_back();
-      Lock.unlock();
-      Item.F();
-      Item.L.get().dec();
+      popAndRun(Lock);
     }
   }
 
   size_t getThreadCount() const { return ThreadCount; }
 
 private:
+  // Pop one task from the queue and run it. Must be called with Lock held;
+  // releases Lock before executing the task.
+  void popAndRun(std::unique_lock<std::mutex> &Lock) {
+    auto Item = std::move(WorkStack.back());
+    WorkStack.pop_back();
+    Lock.unlock();
+    Item();
+  }
+
   void work(ThreadPoolStrategy S, unsigned ThreadID) {
     threadIndex = ThreadID;
     S.apply_thread_strategy(ThreadID);
@@ -163,31 +177,18 @@ class ThreadPoolExecutor {
             return;
           if (WorkStack.empty())
             break;
-          auto Item = std::move(WorkStack.back());
-          WorkStack.pop_back();
-          Lock.unlock();
-          Item.F();
-          Item.L.get().dec();
+          popAndRun(Lock);
         }
       } else {
         std::unique_lock<std::mutex> Lock(Mutex);
         Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
         if (Stop)
           break;
-        auto Item = std::move(WorkStack.back());
-        WorkStack.pop_back();
-        Lock.unlock();
-        Item.F();
-        Item.L.get().dec();
+        popAndRun(Lock);
       }
     }
   }
 
-  struct WorkItem {
-    std::function<void()> F;
-    std::reference_wrapper<parallel::detail::Latch> L;
-  };
-
   std::atomic<bool> Stop{false};
   std::vector<WorkItem> WorkStack;
   std::mutex Mutex;

>From 26ca726cebae0ed2a940f02e0935a74c9defadfb Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Mon, 30 Mar 2026 10:20:19 -0700
Subject: [PATCH 4/5] comment

---
 llvm/include/llvm/Support/Parallel.h |  1 +
 llvm/lib/Support/Parallel.cpp        | 14 ++++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index 14ddee720a87e..4c69b950cb082 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -73,6 +73,7 @@ class Latch {
   // destroy the Latch while dec() is still running.
   void dec() {
     std::lock_guard<std::mutex> lock(Mutex);
+    // fetch_sub returns the previous value; == 1 means Count is now 0.
     if (Count.fetch_sub(1, std::memory_order_acq_rel) == 1)
       Cond.notify_all();
   }
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 35c98e140eccd..0cbba3878bb3a 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -227,9 +227,14 @@ size_t parallel::getThreadCount() {
 }
 #endif
 
-// Nested TaskGroups on worker threads use helpSync() which executes tasks from
-// the queue while waiting, preventing deadlock while enabling nested
-// parallelism.
+static bool isNested() {
+#if LLVM_ENABLE_THREADS
+  return threadIndex != UINT_MAX;
+#else
+  return false;
+#endif
+}
+
 TaskGroup::TaskGroup()
     : Parallel(
 #if LLVM_ENABLE_THREADS
@@ -239,10 +244,11 @@ TaskGroup::TaskGroup()
 #endif
       ) {
 }
+
 TaskGroup::~TaskGroup() {
 #if LLVM_ENABLE_THREADS
   // In a nested TaskGroup (threadIndex != -1u), actively help drain the queue.
-  if (Parallel && threadIndex != UINT_MAX)
+  if (Parallel && isNested())
     getDefaultExecutor()->helpSync(L);
 #endif
   L.sync();

>From c3d689a3336aa5d02ee39272024951130dd62463 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Tue, 31 Mar 2026 23:22:00 -0700
Subject: [PATCH 5/5] add Stop

---
 llvm/lib/Support/Parallel.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 0cbba3878bb3a..a7ca21e89f619 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -119,7 +119,7 @@ class ThreadPoolExecutor {
   void helpSync(const parallel::detail::Latch &L) {
     while (L.getCount() != 0) {
       std::unique_lock<std::mutex> Lock(Mutex);
-      if (WorkStack.empty())
+      if (Stop || WorkStack.empty())
         return;
       popAndRun(Lock);
     }



More information about the llvm-commits mailing list