[lld] [ELF] Parallelize --gc-sections mark phase (PR #189321)

Fangrui Song via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 1 23:24:47 PDT 2026


https://github.com/MaskRay updated https://github.com/llvm/llvm-project/pull/189321

>From 0d85ffa7336b1738db2a3e664c6e775f7ebc3bf2 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sun, 29 Mar 2026 17:11:14 -0700
Subject: [PATCH] [ELF] Parallelize --gc-sections mark phase

Add `markParallel`, a level-synchronized BFS built on `parallelFor`.
Each BFS level is processed in parallel; newly discovered sections are
collected in per-thread queues and merged to form the next level.

The parallel path is used when `!TrackWhyLive && partitions.size()==1`.
`parallelFor` naturally degrades to serial when `--threads=1`.

Each iteration uses depth-limited inline recursion (depth < 3) for
cache locality, and an optimistic load-then-exchange for the liveness
bit, which avoids an atomic RMW on already-visited sections.

Linking clang-23 with --gc-sections (8 threads, --time-trace):
markLive 257ms -> 83ms. Total link time 1140ms -> 970ms.

Alternative: parallel-gc-taskgroup uses TaskGroup work-stealing instead
of level-sync BFS. markLive 111ms (vs 83ms here). GC roots are many
but shallow, so TaskGroup spawn overhead exceeds the work-stealing
benefit. Level-sync BFS with inline recursion is better suited.
---
 lld/ELF/MarkLive.cpp | 98 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 3 deletions(-)

diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp
index 1727bd1db0690..0ec72cc4f2c95 100644
--- a/lld/ELF/MarkLive.cpp
+++ b/lld/ELF/MarkLive.cpp
@@ -30,6 +30,7 @@
 #include "lld/Common/Strings.h"
 #include "llvm/ADT/DenseMapInfoVariant.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Parallel.h"
 #include "llvm/Support/TimeProfiler.h"
 #include <variant>
 #include <vector>
@@ -66,6 +67,7 @@ template <class ELFT, bool TrackWhyLive> class MarkLive {
                LiveReason reason);
   void markSymbol(Symbol *sym, StringRef reason);
   void mark();
+  void markParallel();
 
   template <class RelTy>
   void resolveReloc(InputSectionBase &sec, const RelTy &rel, bool fromFDE);
@@ -225,6 +227,51 @@ void MarkLive<ELFT, TrackWhyLive>::scanEhFrameSection(EhInputSection &eh) {
   }
 }
 
+// Walk all GC edges from sec and call fn(target, offset) for each edge.
+template <class ELFT, class Fn>
+static void
+forEachEdge(Ctx &ctx, InputSectionBase &sec,
+            const DenseMap<StringRef, SmallVector<InputSectionBase *, 0>>
+                &cNamedSections,
+            Fn fn) {
+  auto resolveEdge = [&](const auto &rel) {
+    Symbol &sym = sec.file->getRelocTargetSym(rel);
+    if (!sym.used)
+      sym.used = true;
+    if (auto *d = dyn_cast<Defined>(&sym)) {
+      if (auto *relSec = dyn_cast_or_null<InputSectionBase>(d->section)) {
+        uint64_t offset = d->value;
+        if (d->isSection()) {
+          offset += getAddend<ELFT>(ctx, sec, rel);
+          if (auto *ms = dyn_cast<MergeInputSection>(relSec);
+              ms && offset >= ms->content().size())
+            return;
+        }
+        if (auto *ms = dyn_cast<MergeInputSection>(relSec))
+          ms->getSectionPiece(offset).live = true;
+        fn(relSec, offset);
+      }
+      return;
+    }
+    if (auto *ss = dyn_cast<SharedSymbol>(&sym))
+      if (!ss->isWeak())
+        cast<SharedFile>(ss->file)->isNeeded = true;
+    for (InputSectionBase *csec : cNamedSections.lookup(sym.getName()))
+      fn(csec, 0);
+  };
+  const RelsOrRelas<ELFT> rels = sec.template relsOrRelas<ELFT>();
+  for (const typename ELFT::Rel &rel : rels.rels)
+    resolveEdge(rel);
+  for (const typename ELFT::Rela &rel : rels.relas)
+    resolveEdge(rel);
+  for (const typename ELFT::Crel &rel : rels.crels)
+    resolveEdge(rel);
+  for (InputSectionBase *isec : sec.dependentSections)
+    fn(isec, 0);
+  if (sec.nextInSectionGroup)
+    fn(sec.nextInSectionGroup, 0);
+}
+
 // Some sections are used directly by the loader, so they should never be
 // garbage-collected. This function returns true if a given section is such
 // section.
@@ -464,7 +511,12 @@ void MarkLive<ELFT, TrackWhyLive>::run() {
 
 template <class ELFT, bool TrackWhyLive>
 void MarkLive<ELFT, TrackWhyLive>::mark() {
-  // Mark all reachable sections.
+  if constexpr (!TrackWhyLive) {
+    if (ctx.partitions.size() == 1) {
+      markParallel();
+      return;
+    }
+  }
   while (!queue.empty()) {
     InputSectionBase &sec = *queue.pop_back_val();
 
@@ -487,6 +539,46 @@ void MarkLive<ELFT, TrackWhyLive>::mark() {
   }
 }
 
+// Parallel mark using level-synchronized BFS with depth-limited inline
+// recursion. Each parallelFor iteration processes a subtree up to depth 3
+// (DFS for cache locality), then queues deeper discoveries for the next level.
+template <class ELFT, bool TrackWhyLive>
+void MarkLive<ELFT, TrackWhyLive>::markParallel() {
+  const size_t numThreads = parallel::getThreadCount();
+  auto visit = [&](InputSection *sec, int depth,
+                   SmallVector<InputSection *, 0> &localQueue,
+                   auto &self) -> void {
+    forEachEdge<ELFT>(ctx, *sec, cNamedSections,
+                      [&](InputSectionBase *target, uint64_t offset) {
+                        auto &part = reinterpret_cast<std::atomic<uint8_t> &>(
+                            target->partition);
+                        // Optimistic load-then-exchange avoids expensive atomic
+                        // RMW on already-visited sections.
+                        if (part.load(std::memory_order_relaxed) != 0 ||
+                            part.exchange(1, std::memory_order_relaxed) != 0)
+                          return;
+                        if (auto *s = dyn_cast<InputSection>(target)) {
+                          if (depth < 3)
+                            self(s, depth + 1, localQueue, self);
+                          else
+                            localQueue.push_back(s);
+                        }
+                      });
+  };
+
+  while (!queue.empty()) {
+    auto queues =
+        std::make_unique<SmallVector<InputSection *, 0>[]>(numThreads);
+    parallelFor(0, queue.size(), [&](size_t i) {
+      const unsigned tid = parallel::getThreadIndex();
+      visit(queue[i], 0, queues[tid], visit);
+    });
+    queue.clear();
+    for (size_t t = 0; t < numThreads; ++t)
+      queue.append(std::move(queues[t]));
+  }
+}
+
 // Move the sections for some symbols to the main partition, specifically ifuncs
 // (because they can result in an IRELATIVE being added to the main partition's
 // GOT, which means that the ifunc must be available when the main partition is
@@ -531,8 +623,8 @@ template <class ELFT> void elf::markLive(Ctx &ctx) {
     return;
   }
 
-  for (InputSectionBase *sec : ctx.inputSections)
-    sec->markDead();
+  parallelForEach(ctx.inputSections,
+                  [](InputSectionBase *sec) { sec->markDead(); });
 
   // Follow the graph to mark all live sections.
   for (unsigned i = 1, e = ctx.partitions.size(); i <= e; ++i)



More information about the llvm-commits mailing list