[lld] 3b4d800 - [ELF] Parallelize writes of different OutputSections
Fangrui Song via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 24 09:40:14 PDT 2022
Author: Fangrui Song
Date: 2022-08-24T09:40:03-07:00
New Revision: 3b4d800911b52ae23da1a1e3f9105f53d8053397
URL: https://github.com/llvm/llvm-project/commit/3b4d800911b52ae23da1a1e3f9105f53d8053397
DIFF: https://github.com/llvm/llvm-project/commit/3b4d800911b52ae23da1a1e3f9105f53d8053397.diff
LOG: [ELF] Parallelize writes of different OutputSections
We currently process one OutputSection at a time and for each OutputSection
write contained input sections in parallel. This strategy does not leverage
multi-threading well. Instead, parallelize writes of different OutputSections.
The default TaskSize for parallelFor often leads to inferior sharding. We
prepare the task in the caller instead.
* Move llvm::parallel::detail::TaskGroup to llvm::parallel::TaskGroup
* Add llvm::parallel::TaskGroup::execute.
* Change writeSections to declare TaskGroup and pass it to writeTo.
Speed-up with --threads=8:
* clang -DCMAKE_BUILD_TYPE=Release: 1.11x as fast
* clang -DCMAKE_BUILD_TYPE=Debug: 1.10x as fast
* chrome -DCMAKE_BUILD_TYPE=Release: 1.04x as fast
* scylladb build/release: 1.09x as fast
On M1, many benchmarks are a small fraction of a percentage faster. Mozilla showed the largest difference with the patch being about 1.03x as fast.
Differential Revision: https://reviews.llvm.org/D131247
Added:
Modified:
lld/ELF/OutputSections.cpp
lld/ELF/OutputSections.h
lld/ELF/Writer.cpp
lld/test/ELF/arm-thumb-interwork-notfunc.s
lld/test/ELF/hexagon-jump-error.s
lld/test/ELF/linkerscript/overlapping-sections.s
llvm/include/llvm/Support/Parallel.h
llvm/lib/Support/Parallel.cpp
Removed:
################################################################################
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index cbde8ac800d39..48515b7be29a9 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -332,7 +332,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
// Write uncompressed data to a temporary zero-initialized buffer.
auto buf = std::make_unique<uint8_t[]>(size);
- writeTo<ELFT>(buf.get());
+ {
+ parallel::TaskGroup tg;
+ writeTo<ELFT>(buf.get(), tg);
+ }
// We chose 1 (Z_BEST_SPEED) as the default compression level because it is
// the fastest. If -O2 is given, we use level 6 to compress debug info more by
// ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
@@ -386,7 +389,8 @@ static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
llvm_unreachable("unsupported Size argument");
}
-template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
+template <class ELFT>
+void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
llvm::TimeTraceScope timeScope("Write sections", name);
if (type == SHT_NOBITS)
return;
@@ -419,41 +423,68 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
}
// Write leading padding.
- SmallVector<InputSection *, 0> storage;
ArrayRef<InputSection *> sections = getInputSections(*this, storage);
std::array<uint8_t, 4> filler = getFiller();
bool nonZeroFiller = read32(filler.data()) != 0;
if (nonZeroFiller)
fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
- parallelFor(0, sections.size(), [&](size_t i) {
- InputSection *isec = sections[i];
- if (auto *s = dyn_cast<SyntheticSection>(isec))
- s->writeTo(buf + isec->outSecOff);
- else
- isec->writeTo<ELFT>(buf + isec->outSecOff);
-
- // Fill gaps between sections.
- if (nonZeroFiller) {
- uint8_t *start = buf + isec->outSecOff + isec->getSize();
- uint8_t *end;
- if (i + 1 == sections.size())
- end = buf + size;
+ auto fn = [=](size_t begin, size_t end) {
+ size_t numSections = sections.size();
+ for (size_t i = begin; i != end; ++i) {
+ InputSection *isec = sections[i];
+ if (auto *s = dyn_cast<SyntheticSection>(isec))
+ s->writeTo(buf + isec->outSecOff);
else
- end = buf + sections[i + 1]->outSecOff;
- if (isec->nopFiller) {
- assert(target->nopInstrs);
- nopInstrFill(start, end - start);
- } else
- fill(start, end - start, filler);
+ isec->writeTo<ELFT>(buf + isec->outSecOff);
+
+ // Fill gaps between sections.
+ if (nonZeroFiller) {
+ uint8_t *start = buf + isec->outSecOff + isec->getSize();
+ uint8_t *end;
+ if (i + 1 == numSections)
+ end = buf + size;
+ else
+ end = buf + sections[i + 1]->outSecOff;
+ if (isec->nopFiller) {
+ assert(target->nopInstrs);
+ nopInstrFill(start, end - start);
+ } else
+ fill(start, end - start, filler);
+ }
}
- });
+ };
- // Linker scripts may have BYTE()-family commands with which you
- // can write arbitrary bytes to the output. Process them if any.
+ // If there is any BYTE()-family command (rare), write the section content
+ // first then process BYTE to overwrite the filler content. The write is
+ // serial due to the limitation of llvm/Support/Parallel.h.
+ bool written = false;
+ size_t numSections = sections.size();
for (SectionCommand *cmd : commands)
- if (auto *data = dyn_cast<ByteCommand>(cmd))
+ if (auto *data = dyn_cast<ByteCommand>(cmd)) {
+ if (!std::exchange(written, true))
+ fn(0, numSections);
writeInt(buf + data->offset, data->expression().getValue(), data->size);
+ }
+ if (written || !numSections)
+ return;
+
+ // There is no data command. Write content asynchronously to overlap the write
+ // time with other output sections. Note, if a linker script specifies
+ // overlapping output sections (needs --noinhibit-exec or --no-check-sections
+ // to suppress the error), the output may be non-deterministic.
+ const size_t taskSizeLimit = 4 << 20;
+ for (size_t begin = 0, i = 0, taskSize = 0;;) {
+ taskSize += sections[i]->getSize();
+ bool done = ++i == numSections;
+ if (done || taskSize >= taskSizeLimit) {
+ tg.execute([=] { fn(begin, i); });
+ if (done)
+ break;
+ begin = i;
+ taskSize = 0;
+ }
+ }
}
static void finalizeShtGroup(OutputSection *os, InputSection *section) {
@@ -673,10 +704,14 @@ template void OutputSection::writeHeaderTo<ELF32BE>(ELF32BE::Shdr *Shdr);
template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
-template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
+template void OutputSection::writeTo<ELF32LE>(uint8_t *,
+ llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF32BE>(uint8_t *,
+ llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64LE>(uint8_t *,
+ llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64BE>(uint8_t *,
+ llvm::parallel::TaskGroup &);
template void OutputSection::maybeCompress<ELF32LE>();
template void OutputSection::maybeCompress<ELF32BE>();
diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h
index 328daaddeb697..2ba8652981624 100644
--- a/lld/ELF/OutputSections.h
+++ b/lld/ELF/OutputSections.h
@@ -12,6 +12,7 @@
#include "InputSection.h"
#include "LinkerScript.h"
#include "lld/Common/LLVM.h"
+#include "llvm/Support/Parallel.h"
#include <array>
@@ -104,7 +105,8 @@ class OutputSection final : public SectionBase {
bool relro = false;
void finalize();
- template <class ELFT> void writeTo(uint8_t *buf);
+ template <class ELFT>
+ void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
// Check that the addends for dynamic relocations were written correctly.
void checkDynRelAddends(const uint8_t *bufStart);
template <class ELFT> void maybeCompress();
@@ -114,6 +116,8 @@ class OutputSection final : public SectionBase {
void sortCtorsDtors();
private:
+ SmallVector<InputSection *, 0> storage;
+
// Used for implementation of --compress-debug-sections option.
CompressedData compressed;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index a0c743b56c32b..6ec9fc7040931 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -2839,9 +2839,10 @@ template <class ELFT> void Writer<ELFT>::openFile() {
}
template <class ELFT> void Writer<ELFT>::writeSectionsBinary() {
+ parallel::TaskGroup tg;
for (OutputSection *sec : outputSections)
if (sec->flags & SHF_ALLOC)
- sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+ sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
}
static void fillTrap(uint8_t *i, uint8_t *end) {
@@ -2884,16 +2885,21 @@ template <class ELFT> void Writer<ELFT>::writeTrapInstr() {
template <class ELFT> void Writer<ELFT>::writeSections() {
llvm::TimeTraceScope timeScope("Write sections");
- // In -r or --emit-relocs mode, write the relocation sections first as in
- // ELf_Rel targets we might find out that we need to modify the relocated
- // section while doing it.
- for (OutputSection *sec : outputSections)
- if (sec->type == SHT_REL || sec->type == SHT_RELA)
- sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
-
- for (OutputSection *sec : outputSections)
- if (sec->type != SHT_REL && sec->type != SHT_RELA)
- sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+ {
+ // In -r or --emit-relocs mode, write the relocation sections first as in
+ // ELf_Rel targets we might find out that we need to modify the relocated
+ // section while doing it.
+ parallel::TaskGroup tg;
+ for (OutputSection *sec : outputSections)
+ if (sec->type == SHT_REL || sec->type == SHT_RELA)
+ sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+ }
+ {
+ parallel::TaskGroup tg;
+ for (OutputSection *sec : outputSections)
+ if (sec->type != SHT_REL && sec->type != SHT_RELA)
+ sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+ }
// Finally, check that all dynamic relocation addends were written correctly.
if (config->checkDynamicRelocs && config->writeAddends) {
diff --git a/lld/test/ELF/arm-thumb-interwork-notfunc.s b/lld/test/ELF/arm-thumb-interwork-notfunc.s
index 28904a59d0f46..860b59f2328b1 100644
--- a/lld/test/ELF/arm-thumb-interwork-notfunc.s
+++ b/lld/test/ELF/arm-thumb-interwork-notfunc.s
@@ -1,6 +1,7 @@
// REQUIRES: arm
// RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
-// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN
+/// Use --threads=1 to keep emitted warnings across sections sequential.
+// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN
// RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s
.syntax unified
diff --git a/lld/test/ELF/hexagon-jump-error.s b/lld/test/ELF/hexagon-jump-error.s
index ad8ddbf8ede5b..fec873827e573 100644
--- a/lld/test/ELF/hexagon-jump-error.s
+++ b/lld/test/ELF/hexagon-jump-error.s
@@ -1,6 +1,7 @@
# REQUIRES: hexagon
# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s
+## Use --threads=1 to keep emitted warnings across sections sequential.
+# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
.globl _start
.type _start, @function
diff --git a/lld/test/ELF/linkerscript/overlapping-sections.s b/lld/test/ELF/linkerscript/overlapping-sections.s
index ad59aa2d472fe..39957ae6e9d1b 100644
--- a/lld/test/ELF/linkerscript/overlapping-sections.s
+++ b/lld/test/ELF/linkerscript/overlapping-sections.s
@@ -88,8 +88,8 @@
# BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101
# BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101
# BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101
-# Starting here the contents of .sec2 overwrites .sec1:
-# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202
+## Starting here the content may be from either .sec1 or .sec2, depending on the write order.
+# BROKEN-OUTPUT-FILE-NEXT: 8040
# RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH
# BAD-BOTH-LABEL: Section Headers:
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
index ff113f9b44c48..6569479674071 100644
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -30,9 +30,6 @@ namespace parallel {
extern ThreadPoolStrategy strategy;
namespace detail {
-
-#if LLVM_ENABLE_THREADS
-
class Latch {
uint32_t Count;
mutable std::mutex Mutex;
@@ -61,20 +58,42 @@ class Latch {
Cond.wait(lock, [&] { return Count == 0; });
}
};
+} // namespace detail
class TaskGroup {
- Latch L;
+ detail::Latch L;
bool Parallel;
public:
TaskGroup();
~TaskGroup();
+ // Spawn a task, but does not wait for it to finish.
void spawn(std::function<void()> f);
+ // Similar to spawn, but execute the task immediately when ThreadsRequested ==
+ // 1. The difference is to give the following pattern a more intuitive order
+ // when single threading is requested.
+ //
+ // for (size_t begin = 0, i = 0, taskSize = 0;;) {
+ // taskSize += ...
+ // bool done = ++i == end;
+ // if (done || taskSize >= taskSizeLimit) {
+ // tg.execute([=] { fn(begin, i); });
+ // if (done)
+ // break;
+ // begin = i;
+ // taskSize = 0;
+ // }
+ // }
+ void execute(std::function<void()> f);
+
void sync() const { L.sync(); }
};
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
const ptrdiff_t MinParallelSize = 1024;
/// Inclusive median.
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 798d7124e7e94..9f13726e36913 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -19,10 +19,9 @@
llvm::ThreadPoolStrategy llvm::parallel::strategy;
-#if LLVM_ENABLE_THREADS
-
namespace llvm {
namespace parallel {
+#if LLVM_ENABLE_THREADS
namespace detail {
namespace {
@@ -143,6 +142,8 @@ Executor *Executor::getDefaultExecutor() {
return Exec.get();
}
} // namespace
+} // namespace detail
+#endif
static std::atomic<int> TaskGroupInstances;
@@ -159,21 +160,27 @@ TaskGroup::~TaskGroup() {
}
void TaskGroup::spawn(std::function<void()> F) {
+#if LLVM_ENABLE_THREADS
if (Parallel) {
L.inc();
- Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
+ detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
F();
L.dec();
});
- } else {
- F();
+ return;
}
+#endif
+ F();
}
-} // namespace detail
+void TaskGroup::execute(std::function<void()> F) {
+ if (parallel::strategy.ThreadsRequested == 1)
+ F();
+ else
+ spawn(F);
+}
} // namespace parallel
} // namespace llvm
-#endif // LLVM_ENABLE_THREADS
void llvm::parallelFor(size_t Begin, size_t End,
llvm::function_ref<void(size_t)> Fn) {
@@ -190,7 +197,7 @@ void llvm::parallelFor(size_t Begin, size_t End,
if (TaskSize == 0)
TaskSize = 1;
- parallel::detail::TaskGroup TG;
+ parallel::TaskGroup TG;
for (; Begin + TaskSize < End; Begin += TaskSize) {
TG.spawn([=, &Fn] {
for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)
More information about the llvm-commits
mailing list