[lld] 4cdc441 - [ELF] Parallelize --compress-debug-sections=zlib

Fangrui Song via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 25 10:29:08 PST 2022


Author: Fangrui Song
Date: 2022-01-25T10:29:04-08:00
New Revision: 4cdc4416903bd4a818d70042d479442725eeebcc

URL: https://github.com/llvm/llvm-project/commit/4cdc4416903bd4a818d70042d479442725eeebcc
DIFF: https://github.com/llvm/llvm-project/commit/4cdc4416903bd4a818d70042d479442725eeebcc.diff

LOG: [ELF] Parallelize --compress-debug-sections=zlib

When linking a Debug build of clang (265MiB SHF_ALLOC sections, 920MiB uncompressed
debug info), "Compress debug sections" takes 2/3 of the total time in a --threads=1
link and ~70% of the total time in a --threads=8 link.

This patch splits a section into 1MiB shards and calls zlib `deflate` on them in parallel.

DEFLATE blocks are a bit sequence. We need to ensure every shard starts
at a byte boundary for concatenation. We use Z_SYNC_FLUSH for all shards
but the last to flush the output to a byte boundary. (Z_FULL_FLUSH can
be used as well, but Z_FULL_FLUSH clears the hash table which just
wastes time.)

The last block requires the BFINAL flag. We call deflate with Z_FINISH
to set the flag as well as flush the output to a byte boundary. Under
the hood, all of Z_SYNC_FLUSH, Z_FULL_FLUSH, and Z_FINISH emit a
non-compressed block (called stored block in zlib). RFC1951 says "Any
bits of input up to the next byte boundary are ignored."

In a --threads=8 link, "Compress debug sections" is 5.7x as fast and the total
speed is 2.54x. Because the hash table for one shard is not shared with the next
shard, the output is slightly larger. Better compression ratio can be achieved
by preloading the window size from the previous shard as dictionary
(`deflateSetDictionary`), but that is overkill.

```
# 1MiB shards
% bloaty clang.new -- clang.old
    FILE SIZE        VM SIZE
 --------------  --------------
  +0.3%  +129Ki  [ = ]       0    .debug_str
  +0.1%  +105Ki  [ = ]       0    .debug_info
  +0.3%  +101Ki  [ = ]       0    .debug_line
  +0.2% +2.66Ki  [ = ]       0    .debug_abbrev
  +0.0% +1.19Ki  [ = ]       0    .debug_ranges
  +0.1%  +341Ki  [ = ]       0    TOTAL

# 2MiB shards
% bloaty clang.new -- clang.old
    FILE SIZE        VM SIZE
 --------------  --------------
  +0.2% +74.2Ki  [ = ]       0    .debug_line
  +0.1% +72.3Ki  [ = ]       0    .debug_str
  +0.0% +69.9Ki  [ = ]       0    .debug_info
  +0.1%    +976  [ = ]       0    .debug_abbrev
  +0.0%    +882  [ = ]       0    .debug_ranges
  +0.0%  +218Ki  [ = ]       0    TOTAL
```

Bonuses of not using zlib::compress:

* we can compress a debug section larger than 4GiB
* peak memory usage is lower because for most shards the output size is less
  than 50% input size (all less than 55% for a large binary I tested, but
  decreasing the initial output size does not decrease memory usage)

Reviewed By: ikudrin

Differential Revision: https://reviews.llvm.org/D117853

Added: 
    

Modified: 
    lld/ELF/CMakeLists.txt
    lld/ELF/OutputSections.cpp
    lld/ELF/OutputSections.h

Removed: 
    


################################################################################
diff  --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
index f85d0fb9f55e3..b37035d3e7429 100644
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -2,6 +2,10 @@ set(LLVM_TARGET_DEFINITIONS Options.td)
 tablegen(LLVM Options.inc -gen-opt-parser-defs)
 add_public_tablegen_target(ELFOptionsTableGen)
 
+if(LLVM_ENABLE_ZLIB)
+  set(imported_libs ZLIB::ZLIB)
+endif()
+
 add_lld_library(lldELF
   AArch64ErrataFix.cpp
   Arch/AArch64.cpp
@@ -58,6 +62,7 @@ add_lld_library(lldELF
 
   LINK_LIBS
   lldCommon
+  ${imported_libs}
   ${LLVM_PTHREAD_LIB}
 
   DEPENDS

diff  --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 07ee7d84a2cd3..cffde5d61ac91 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -15,7 +15,7 @@
 #include "lld/Common/Memory.h"
 #include "lld/Common/Strings.h"
 #include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/Support/Compression.h"
+#include "llvm/Config/config.h" // LLVM_ENABLE_ZLIB
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Parallel.h"
@@ -23,6 +23,9 @@
 #include "llvm/Support/TimeProfiler.h"
 #include <regex>
 #include <unordered_set>
+#if LLVM_ENABLE_ZLIB
+#include <zlib.h>
+#endif
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -284,13 +287,45 @@ static void fill(uint8_t *buf, size_t size,
   memcpy(buf + i, filler.data(), size - i);
 }
 
+#if LLVM_ENABLE_ZLIB
+static SmallVector<uint8_t, 0> deflateShard(ArrayRef<uint8_t> in, int level,
+                                            int flush) {
+  // 15 and 8 are default. windowBits=-15 is negative to generate raw deflate
+  // data with no zlib header or trailer.
+  z_stream s = {};
+  deflateInit2(&s, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
+  s.next_in = const_cast<uint8_t *>(in.data());
+  s.avail_in = in.size();
+
+  // Allocate a buffer of half of the input size, and grow it by 1.5x if
+  // insufficient.
+  SmallVector<uint8_t, 0> out;
+  size_t pos = 0;
+  out.resize_for_overwrite(std::max<size_t>(in.size() / 2, 64));
+  do {
+    if (pos == out.size())
+      out.resize_for_overwrite(out.size() * 3 / 2);
+    s.next_out = out.data() + pos;
+    s.avail_out = out.size() - pos;
+    (void)deflate(&s, flush);
+    pos = s.next_out - out.data();
+  } while (s.avail_out == 0);
+  assert(s.avail_in == 0);
+
+  out.truncate(pos);
+  deflateEnd(&s);
+  return out;
+}
+#endif
+
 // Compress section contents if this section contains debug info.
 template <class ELFT> void OutputSection::maybeCompress() {
+#if LLVM_ENABLE_ZLIB
   using Elf_Chdr = typename ELFT::Chdr;
 
   // Compress only DWARF debug sections.
   if (!config->compressDebugSections || (flags & SHF_ALLOC) ||
-      !name.startswith(".debug_"))
+      !name.startswith(".debug_") || size == 0)
     return;
 
   llvm::TimeTraceScope timeScope("Compress debug sections");
@@ -309,13 +344,42 @@ template <class ELFT> void OutputSection::maybeCompress() {
   // -O2 is given, we use level 6 to compress debug info more by ~15%. We found
 // that level 7 to 9 doesn't make much difference (~1% more compression) while
   // they take significant amount of time (~2x), so level 6 seems enough.
-  if (Error e = zlib::compress(toStringRef(buf), compressedData,
-                               config->optimize >= 2 ? 6 : 1))
-    fatal("compress failed: " + llvm::toString(std::move(e)));
+  const int level = config->optimize >= 2 ? 6 : Z_BEST_SPEED;
+
+  // Split input into 1-MiB shards.
+  constexpr size_t shardSize = 1 << 20;
+  const size_t numShards = (size + shardSize - 1) / shardSize;
+  auto shardsIn = std::make_unique<ArrayRef<uint8_t>[]>(numShards);
+  for (size_t i = 0, start = 0, end; start != buf.size(); ++i, start = end) {
+    end = std::min(start + shardSize, buf.size());
+    shardsIn[i] = makeArrayRef<uint8_t>(buf.data() + start, end - start);
+  }
+
+  // Compress shards and compute Adler-32 checksums. Use Z_SYNC_FLUSH for all
+  // shards but the last to flush the output to a byte boundary to be
+  // concatenated with the next shard.
+  auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
+  auto shardsAdler = std::make_unique<uint32_t[]>(numShards);
+  parallelForEachN(0, numShards, [&](size_t i) {
+    shardsOut[i] = deflateShard(shardsIn[i], level,
+                                i != numShards - 1 ? Z_SYNC_FLUSH : Z_FINISH);
+    shardsAdler[i] = adler32(1, shardsIn[i].data(), shardsIn[i].size());
+  });
+
+  // Update section size and combine Adler-32 checksums.
+  uint32_t checksum = 1;       // Initial Adler-32 value
+  size = sizeof(Elf_Chdr) + 2; // Elf_Chdr and zlib header
+  for (size_t i = 0; i != numShards; ++i) {
+    size += shardsOut[i].size();
+    checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
+  }
+  size += 4; // checksum
 
-  // Update section headers.
-  size = sizeof(Elf_Chdr) + compressedData.size();
+  compressed.shards = std::move(shardsOut);
+  compressed.numShards = numShards;
+  compressed.checksum = checksum;
   flags |= SHF_COMPRESSED;
+#endif
 }
 
 static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
@@ -339,10 +403,25 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
   // If --compress-debug-section is specified and if this is a debug section,
   // we've already compressed section contents. If that's the case,
   // just write it down.
-  if (!compressedData.empty()) {
+  if (compressed.shards) {
     memcpy(buf, zDebugHeader.data(), zDebugHeader.size());
-    memcpy(buf + zDebugHeader.size(), compressedData.data(),
-           compressedData.size());
+    buf += zDebugHeader.size();
+    size -= zDebugHeader.size();
+
+    // Compute shard offsets.
+    auto offsets = std::make_unique<size_t[]>(compressed.numShards);
+    offsets[0] = 2; // zlib header
+    for (size_t i = 1; i != compressed.numShards; ++i)
+      offsets[i] = offsets[i - 1] + compressed.shards[i - 1].size();
+
+    buf[0] = 0x78; // CMF
+    buf[1] = 0x01; // FLG: best speed
+    parallelForEachN(0, compressed.numShards, [&](size_t i) {
+      memcpy(buf + offsets[i], compressed.shards[i].data(),
+             compressed.shards[i].size());
+    });
+
+    write32be(buf + size - 4, compressed.checksum);
     return;
   }
 

diff  --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h
index 4f589d8432e43..957e6768ff6ea 100644
--- a/lld/ELF/OutputSections.h
+++ b/lld/ELF/OutputSections.h
@@ -25,6 +25,12 @@ struct PhdrEntry;
 class InputSection;
 class InputSectionBase;
 
+struct CompressedData {
+  std::unique_ptr<SmallVector<uint8_t, 0>[]> shards;
+  uint32_t numShards = 0;
+  uint32_t checksum = 0;
+};
+
 // This represents a section in an output file.
 // It is composed of multiple InputSections.
 // The writer creates multiple OutputSections and assign them unique,
@@ -113,7 +119,7 @@ class OutputSection final : public SectionCommand, public SectionBase {
 private:
   // Used for implementation of --compress-debug-sections option.
   SmallVector<uint8_t, 0> zDebugHeader;
-  SmallVector<char, 0> compressedData;
+  CompressedData compressed;
 
   std::array<uint8_t, 4> getFiller();
 };


        


More information about the llvm-commits mailing list