[lld] fa74144 - [ELF] Parallelize --compress-debug-sections=zstd

Wed Sep 21 11:13:10 PDT 2022

Author: Fangrui Song
Date: 2022-09-21T11:13:03-07:00
New Revision: fa74144c64dff6b145b0b3fa9397f913ddaa87bf

URL: https://github.com/llvm/llvm-project/commit/fa74144c64dff6b145b0b3fa9397f913ddaa87bf
DIFF: https://github.com/llvm/llvm-project/commit/fa74144c64dff6b145b0b3fa9397f913ddaa87bf.diff

LOG: [ELF] Parallelize --compress-debug-sections=zstd

See D117853: compressing debug sections is a bottleneck, so there is
significant value in parallelizing this step.

zstd provides a multi-threading API, and its output is deterministic even with
different numbers of threads (see https://github.com/facebook/zstd/issues/2238).
Therefore we can leverage it instead of using the pigz-style sharding approach.

Also, switch to the default compression level 3. The current level 5
is significantly slower without providing a size benefit that justifies it.

```
  'dash b.sh 1' ran
    1.05 ± 0.01 times faster than 'dash b.sh 3'
    1.18 ± 0.01 times faster than 'dash b.sh 4'
    1.29 ± 0.02 times faster than 'dash b.sh 5'

level=1 size: 358946945
level=3 size: 309002145
level=4 size: 307693204
level=5 size: 297828315
```

Reviewed By: andrewng, peter.smith

Differential Revision: https://reviews.llvm.org/D133679

Added: 
    

Modified: 
    lld/ELF/OutputSections.cpp

Removed: 
    


################################################################################
diff  --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 517e8156330a5..42af1159560a3 100644

--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -24,6 +24,9 @@
 #if LLVM_ENABLE_ZLIB
 #include <zlib.h>
 #endif
+#if LLVM_ENABLE_ZSTD
+#include <zstd.h>
+#endif
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -331,25 +334,60 @@ template <class ELFT> void OutputSection::maybeCompress() {
   llvm::TimeTraceScope timeScope("Compress debug sections");
   compressed.uncompressedSize = size;
   auto buf = std::make_unique<uint8_t[]>(size);
+  // Write uncompressed data to a temporary zero-initialized buffer.
+  {
+    parallel::TaskGroup tg;
+    writeTo<ELFT>(buf.get(), tg);
+  }
+
+#if LLVM_ENABLE_ZSTD
+  // Use ZSTD's streaming compression API which permits parallel workers working
+  // on the stream. See http://facebook.github.io/zstd/zstd_manual.html
+  // "Streaming compression - HowTo".
   if (config->compressDebugSections == DebugCompressionType::Zstd) {
-    {
-      parallel::TaskGroup tg;
-      writeTo<ELFT>(buf.get(), tg);
-    }
+    // Allocate a buffer of half of the input size, and grow it by 1.5x if
+    // insufficient.
     compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
-    compression::zstd::compress(makeArrayRef(buf.get(), size),
-                                compressed.shards[0]);
-    size = sizeof(Elf_Chdr) + compressed.shards[0].size();
+    SmallVector<uint8_t, 0> &out = compressed.shards[0];
+    out.resize_for_overwrite(std::max<size_t>(size / 2, 32));
+    size_t pos = 0;
+
+    ZSTD_CCtx *cctx = ZSTD_createCCtx();
+    size_t ret = ZSTD_CCtx_setParameter(
+        cctx, ZSTD_c_nbWorkers, parallel::strategy.compute_thread_count());
+    if (ZSTD_isError(ret))
+      fatal(Twine("ZSTD_CCtx_setParameter: ") + ZSTD_getErrorName(ret));
+    ZSTD_outBuffer zob = {out.data(), out.size(), 0};
+    ZSTD_EndDirective directive = ZSTD_e_continue;
+    const size_t blockSize = ZSTD_CStreamInSize();
+    do {
+      const size_t n = std::min(size - pos, blockSize);
+      if (n == size - pos)
+        directive = ZSTD_e_end;
+      ZSTD_inBuffer zib = {buf.get() + pos, n, 0};
+      size_t bytesRemaining = 0;
+      while (zib.pos != zib.size ||
+             (directive == ZSTD_e_end && bytesRemaining != 0)) {
+        if (zob.pos == zob.size) {
+          out.resize_for_overwrite(out.size() * 3 / 2);
+          zob.dst = out.data();
+          zob.size = out.size();
+        }
+        bytesRemaining = ZSTD_compressStream2(cctx, &zob, &zib, directive);
+        assert(!ZSTD_isError(bytesRemaining));
+      }
+      pos += n;
+    } while (directive != ZSTD_e_end);
+    out.resize(zob.pos);
+    ZSTD_freeCCtx(cctx);
+
+    size = sizeof(Elf_Chdr) + out.size();
     flags |= SHF_COMPRESSED;
     return;
   }
+#endif
 
 #if LLVM_ENABLE_ZLIB
-  // Write uncompressed data to a temporary zero-initialized buffer.
-  {
-    parallel::TaskGroup tg;
-    writeTo<ELFT>(buf.get(), tg);
-  }
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
  // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more