[lld] [ELF] Parallelize --gc-sections mark phase (PR #189321)
Fangrui Song via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 23:24:47 PDT 2026
https://github.com/MaskRay updated https://github.com/llvm/llvm-project/pull/189321
>From 0d85ffa7336b1738db2a3e664c6e775f7ebc3bf2 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sun, 29 Mar 2026 17:11:14 -0700
Subject: [PATCH] [ELF] Parallelize --gc-sections mark phase
Add `markParallel` using level-synchronized `parallelFor`. Each BFS
level is processed in parallel; newly discovered sections are collected
in per-thread queues and merged for the next level.
The parallel path is used when `!TrackWhyLive && partitions.size()==1`.
`parallelFor` naturally degrades to serial when `--threads=1`.
Uses depth-limited inline recursion (depth<3) for cache locality, and an
optimistic load-then-exchange dedup that avoids atomic RMW on already-visited sections.
Linking clang-23 with --gc-sections (8 threads, --time-trace):
markLive 257ms -> 83ms. Total link time 1140ms -> 970ms.
Alternative: parallel-gc-taskgroup uses TaskGroup work-stealing instead
of level-sync BFS. markLive 111ms (vs 83ms here). GC roots are many
but shallow, so TaskGroup spawn overhead exceeds the work-stealing
benefit. Level-sync BFS with inline recursion is better suited to this workload.
---
lld/ELF/MarkLive.cpp | 98 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 95 insertions(+), 3 deletions(-)
diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp
index 1727bd1db0690..0ec72cc4f2c95 100644
--- a/lld/ELF/MarkLive.cpp
+++ b/lld/ELF/MarkLive.cpp
@@ -30,6 +30,7 @@
#include "lld/Common/Strings.h"
#include "llvm/ADT/DenseMapInfoVariant.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Parallel.h"
#include "llvm/Support/TimeProfiler.h"
#include <variant>
#include <vector>
@@ -66,6 +67,7 @@ template <class ELFT, bool TrackWhyLive> class MarkLive {
LiveReason reason);
void markSymbol(Symbol *sym, StringRef reason);
void mark();
+ void markParallel();
template <class RelTy>
void resolveReloc(InputSectionBase &sec, const RelTy &rel, bool fromFDE);
@@ -225,6 +227,51 @@ void MarkLive<ELFT, TrackWhyLive>::scanEhFrameSection(EhInputSection &eh) {
}
}
+// Walk all GC edges from sec and call fn(target, offset) for each edge.
+template <class ELFT, class Fn>
+static void
+forEachEdge(Ctx &ctx, InputSectionBase &sec,
+ const DenseMap<StringRef, SmallVector<InputSectionBase *, 0>>
+ &cNamedSections,
+ Fn fn) {
+ auto resolveEdge = [&](const auto &rel) {
+ Symbol &sym = sec.file->getRelocTargetSym(rel);
+ if (!sym.used)
+ sym.used = true;
+ if (auto *d = dyn_cast<Defined>(&sym)) {
+ if (auto *relSec = dyn_cast_or_null<InputSectionBase>(d->section)) {
+ uint64_t offset = d->value;
+ if (d->isSection()) {
+ offset += getAddend<ELFT>(ctx, sec, rel);
+ if (auto *ms = dyn_cast<MergeInputSection>(relSec);
+ ms && offset >= ms->content().size())
+ return;
+ }
+ if (auto *ms = dyn_cast<MergeInputSection>(relSec))
+ ms->getSectionPiece(offset).live = true;
+ fn(relSec, offset);
+ }
+ return;
+ }
+ if (auto *ss = dyn_cast<SharedSymbol>(&sym))
+ if (!ss->isWeak())
+ cast<SharedFile>(ss->file)->isNeeded = true;
+ for (InputSectionBase *csec : cNamedSections.lookup(sym.getName()))
+ fn(csec, 0);
+ };
+ const RelsOrRelas<ELFT> rels = sec.template relsOrRelas<ELFT>();
+ for (const typename ELFT::Rel &rel : rels.rels)
+ resolveEdge(rel);
+ for (const typename ELFT::Rela &rel : rels.relas)
+ resolveEdge(rel);
+ for (const typename ELFT::Crel &rel : rels.crels)
+ resolveEdge(rel);
+ for (InputSectionBase *isec : sec.dependentSections)
+ fn(isec, 0);
+ if (sec.nextInSectionGroup)
+ fn(sec.nextInSectionGroup, 0);
+}
+
// Some sections are used directly by the loader, so they should never be
// garbage-collected. This function returns true if a given section is such
// section.
@@ -464,7 +511,12 @@ void MarkLive<ELFT, TrackWhyLive>::run() {
template <class ELFT, bool TrackWhyLive>
void MarkLive<ELFT, TrackWhyLive>::mark() {
- // Mark all reachable sections.
+ if constexpr (!TrackWhyLive) {
+ if (ctx.partitions.size() == 1) {
+ markParallel();
+ return;
+ }
+ }
while (!queue.empty()) {
InputSectionBase &sec = *queue.pop_back_val();
@@ -487,6 +539,46 @@ void MarkLive<ELFT, TrackWhyLive>::mark() {
}
}
+// Parallel mark using level-synchronized BFS with depth-limited inline
+// recursion. Each parallelFor iteration processes a subtree up to depth 3
+// (DFS for cache locality), then queues deeper discoveries for the next level.
+template <class ELFT, bool TrackWhyLive>
+void MarkLive<ELFT, TrackWhyLive>::markParallel() {
+ const size_t numThreads = parallel::getThreadCount();
+ auto visit = [&](InputSection *sec, int depth,
+ SmallVector<InputSection *, 0> &localQueue,
+ auto &self) -> void {
+ forEachEdge<ELFT>(ctx, *sec, cNamedSections,
+ [&](InputSectionBase *target, uint64_t offset) {
+ auto &part = reinterpret_cast<std::atomic<uint8_t> &>(
+ target->partition);
+ // Optimistic load-then-exchange avoids expensive atomic
+ // RMW on already-visited sections.
+ if (part.load(std::memory_order_relaxed) != 0 ||
+ part.exchange(1, std::memory_order_relaxed) != 0)
+ return;
+ if (auto *s = dyn_cast<InputSection>(target)) {
+ if (depth < 3)
+ self(s, depth + 1, localQueue, self);
+ else
+ localQueue.push_back(s);
+ }
+ });
+ };
+
+ while (!queue.empty()) {
+ auto queues =
+ std::make_unique<SmallVector<InputSection *, 0>[]>(numThreads);
+ parallelFor(0, queue.size(), [&](size_t i) {
+ const unsigned tid = parallel::getThreadIndex();
+ visit(queue[i], 0, queues[tid], visit);
+ });
+ queue.clear();
+ for (size_t t = 0; t < numThreads; ++t)
+ queue.append(std::move(queues[t]));
+ }
+}
+
// Move the sections for some symbols to the main partition, specifically ifuncs
// (because they can result in an IRELATIVE being added to the main partition's
// GOT, which means that the ifunc must be available when the main partition is
@@ -531,8 +623,8 @@ template <class ELFT> void elf::markLive(Ctx &ctx) {
return;
}
- for (InputSectionBase *sec : ctx.inputSections)
- sec->markDead();
+ parallelForEach(ctx.inputSections,
+ [](InputSectionBase *sec) { sec->markDead(); });
// Follow the graph to mark all live sections.
for (unsigned i = 1, e = ctx.partitions.size(); i <= e; ++i)
More information about the llvm-commits
mailing list