[PATCH] D133679: [ELF] Parallelize --compress-debug-sections=zstd

Fangrui Song via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Sun Sep 11 17:37:52 PDT 2022


MaskRay created this revision.
Herald added subscribers: StephenFan, arichardson, emaste.
Herald added a project: All.
MaskRay added a comment.
MaskRay updated this revision to Diff 459392.
MaskRay retitled this revision from "[WIP][ELF] Parallelize --compress-debug-sections=zstd" to "[ELF] Parallelize --compress-debug-sections=zstd".
MaskRay added reviewers: ikudrin, andrewng, peter.smith.
MaskRay published this revision for review.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.

This patch is derived from the following zstd parallelism experiment:

  # Build zstd with cmake
  git clone https://github.com/facebook/zstd
  cd zstd
  cmake -GNinja -Hbuild/cmake -Bout/release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/opt/zstd
  ninja -C out/release install



  % g++ -O2 -g z.cc -L/tmp/p/zstd/out/release/lib -lzstd -o z
  % time ./z debug_info debug_info.zstd 1
  ./z debug_info debug_info.zstd 1  3.30s user 0.75s system 113% cpu 3.574 total
  % time ./z debug_info debug_info.zstd 2
  ./z debug_info debug_info.zstd 2  3.39s user 0.71s system 182% cpu 2.239 total
  % time ./z debug_info debug_info.zstd 4
  ./z debug_info debug_info.zstd 4  3.47s user 0.63s system 267% cpu 1.533 total
  % time ./z debug_info debug_info.zstd 8
  ./z debug_info debug_info.zstd 8  3.76s user 0.66s system 349% cpu 1.263 total

The zstd CLI program is significantly faster. I do not know whether that is because it reads the input asynchronously or because of some other feature I have missed. Filed https://github.com/llvm/llvm-project/issues/57685

  % time /tmp/p/zstd/out/release/programs/zstd -fq -T1 debug_info
  /tmp/p/zstd/out/release/programs/zstd -fq -T1 debug_info  2.98s user 0.51s system 126% cpu 2.767 total
  % time /tmp/p/zstd/out/release/programs/zstd -fq -T2 debug_info
  /tmp/p/zstd/out/release/programs/zstd -fq -T2 debug_info  3.02s user 0.52s system 235% cpu 1.501 total
  % time /tmp/p/zstd/out/release/programs/zstd -fq -T4 debug_info
  /tmp/p/zstd/out/release/programs/zstd -fq -T4 debug_info  3.02s user 0.51s system 435% cpu 0.811 total



  #include <algorithm>
  #include <vector>
  
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/stat.h>
  #include <unistd.h>
  #include <zstd.h>
  
  int main(int argc, char *argv[]) {
    int fdin = open(argv[1], O_RDONLY);
    if (fdin < 0) return 1;
    struct stat st;
    if (fstat(fdin, &st) < 0) return 1;
    void *in = mmap(0, st.st_size, PROT_READ, MAP_SHARED, fdin, 0);
    if (in == MAP_FAILED) return 1;
    int fdout = open(argv[2], O_RDWR);
    if (fdout < 0) return 1;
    int th = 0;
    if (argc > 3)
      th = atoi(argv[3]);
  
    std::vector<uint8_t> out;
    out.resize(64);
    size_t pos = 0;
  
    ZSTD_CCtx *cctx = ZSTD_createCCtx();
    if (!cctx)
      return 1;
    if (ZSTD_isError(ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, th)))
      return 2;
    ZSTD_outBuffer zob = {out.data(), out.size(), 0};
    auto directive = ZSTD_e_continue;
    do {
      size_t n = std::min(st.st_size-pos, (size_t)1<<20);
      if (n == st.st_size-pos)
        directive = ZSTD_e_end;
      ZSTD_inBuffer zib = { (char*)in+pos, n, 0 };
      size_t more = 1;
      while (zib.pos != zib.size || directive == ZSTD_e_end && more != 0) {
        if (zob.pos == zob.size) {
          out.resize(out.size() * 3 / 2);
          zob.dst = out.data();
          zob.size = out.size();
        }
  
        more = ZSTD_compressStream2(cctx, &zob, &zib, directive);
        if (ZSTD_isError(more)) {
          fprintf(stderr, "%s\n", ZSTD_getErrorName(more));
          return 3;
        }
      }
      pos += n;
    } while (directive != ZSTD_e_end);
  
    out.resize(zob.pos);
    ftruncate(fdout, out.size());
  
    void *mout = mmap(0, out.size(), PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0);
    memcpy(mout, out.data(), out.size());
    munmap(mout, out.size());
    close(fdout);
  
    ZSTD_freeCCtx(cctx);
  }


MaskRay added a comment.

  git clone https://github.com/llvm/llvm-project.git --depth=1
  cd llvm-project
  curl -L 'https://reviews.llvm.org/D133679?download=1' | patch -p1
  
  # Build lld. See https://llvm.org/docs/GettingStarted.html
  cmake -GNinja -Sllvm -B/tmp/out/custom1 -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=lld -DLLVM_ENABLE_ZSTD=FORCE_ON -DCMAKE_PREFIX_PATH=/tmp/opt/zstd -DLLVM_ENABLE_LLD=on
  ninja -C /tmp/out/custom1 lld



  No compression
  
  % time /tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=2
  /tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=2  9.89s user 2.92s system 151% cpu 8.477 total
  % time /tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=4
  /tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=4  10.82s user 3.08s system 209% cpu 6.640 total
  
  zstd
  
  % time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=1
  /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o    14.19s user 3.10s system 104% cpu 16.532 total
  % time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=2
  /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o    15.16s user 3.83s system 162% cpu 11.657 total
  % time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=4
  /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o    16.73s user 3.77s system 219% cpu 9.323 total
  % time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=8
  /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o    18.97s user 4.04s system 280% cpu 8.194 total
  
  zlib
  
  % time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o a.out --threads=2
  /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o    23.68s user 3.02s system 168% cpu 15.805 total
  % time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o a.out --threads=4
  /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o    24.55s user 3.43s system 253% cpu 11.036 total


MaskRay added a comment.

update


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D133679

Files:
  lld/ELF/OutputSections.cpp


Index: lld/ELF/OutputSections.cpp
===================================================================
--- lld/ELF/OutputSections.cpp
+++ lld/ELF/OutputSections.cpp
@@ -24,6 +24,9 @@
 #if LLVM_ENABLE_ZLIB
 #include <zlib.h>
 #endif
+#if LLVM_ENABLE_ZSTD
+#include <zstd.h>
+#endif
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -331,25 +334,55 @@
   llvm::TimeTraceScope timeScope("Compress debug sections");
   compressed.uncompressedSize = size;
   auto buf = std::make_unique<uint8_t[]>(size);
-  if (config->compressDebugSections == DebugCompressionType::Zstd) {
-    {
-      parallel::TaskGroup tg;
-      writeTo<ELFT>(buf.get(), tg);
-    }
-    compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
-    compression::zstd::compress(makeArrayRef(buf.get(), size),
-                                compressed.shards[0]);
-    size = sizeof(Elf_Chdr) + compressed.shards[0].size();
-    flags |= SHF_COMPRESSED;
-    return;
-  }
-
-#if LLVM_ENABLE_ZLIB
   // Write uncompressed data to a temporary zero-initialized buffer.
   {
     parallel::TaskGroup tg;
     writeTo<ELFT>(buf.get(), tg);
   }
+
+#if LLVM_ENABLE_ZSTD
+  if (config->compressDebugSections == DebugCompressionType::Zstd) {
+    // Allocate a buffer of half of the input size, and grow it by 1.5x if
+    // insufficient.
+    compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
+    SmallVector<uint8_t, 0> &out = compressed.shards[0];
+    out.resize_for_overwrite(std::max<size_t>(size / 2, 32));
+    size_t pos = 0;
+
+    ZSTD_CCtx *cctx = ZSTD_createCCtx();
+    size_t ret = ZSTD_CCtx_setParameter(
+        cctx, ZSTD_c_nbWorkers, parallel::strategy.compute_thread_count());
+    if (ZSTD_isError(ret))
+      fatal(Twine("ZSTD_CCtx_setParameter: ") + ZSTD_getErrorName(ret));
+    ZSTD_outBuffer zob = {out.data(), out.size(), 0};
+    ZSTD_EndDirective directive = ZSTD_e_continue;
+    do {
+      const size_t n = std::min(size - pos, (size_t)1 << 20);
+      if (n == size - pos)
+        directive = ZSTD_e_end;
+      ZSTD_inBuffer zib = {buf.get() + pos, n, 0};
+      size_t more = 1;
+      while (zib.pos != zib.size || (directive == ZSTD_e_end && more != 0)) {
+        if (zob.pos == zob.size) {
+          out.resize_for_overwrite(out.size() * 3 / 2);
+          zob.dst = out.data();
+          zob.size = out.size();
+        }
+        more = ZSTD_compressStream2(cctx, &zob, &zib, directive);
+        assert(!ZSTD_isError(more));
+      }
+      pos += n;
+    } while (directive != ZSTD_e_end);
+    out.resize(zob.pos);
+    ZSTD_freeCCtx(cctx);
+
+    size = sizeof(Elf_Chdr) + out.size();
+    flags |= SHF_COMPRESSED;
+    return;
+  }
+#endif
+
+#if LLVM_ENABLE_ZLIB
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
   // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D133679.459392.patch
Type: text/x-patch
Size: 2954 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220912/40aa7e0f/attachment.bin>


More information about the llvm-commits mailing list