[PATCH] D133679: [ELF] Parallelize --compress-debug-sections=zstd
Fangrui Song via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 11 17:37:52 PDT 2022
MaskRay created this revision.
Herald added subscribers: StephenFan, arichardson, emaste.
Herald added a project: All.
MaskRay added a comment.
MaskRay updated this revision to Diff 459392.
MaskRay retitled this revision from "[WIP][ELF] Parallelize --compress-debug-sections=zstd" to "[ELF] Parallelize --compress-debug-sections=zstd".
MaskRay added reviewers: ikudrin, andrewng, peter.smith.
MaskRay published this revision for review.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.
This patch is derived from the following zstd parallelism experiment:
# Build zstd with cmake
git clone https://github.com/facebook/zstd
cd zstd
cmake -GNinja -Hbuild/cmake -Bout/release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/tmp/opt/zstd
ninja -C out/release install
% g++ -O2 -g z.cc -L/tmp/p/zstd/out/release/lib -lzstd -o z
% time ./z debug_info debug_info.zstd 1
./z debug_info debug_info.zstd 1 3.30s user 0.75s system 113% cpu 3.574 total
% time ./z debug_info debug_info.zstd 2
./z debug_info debug_info.zstd 2 3.39s user 0.71s system 182% cpu 2.239 total
% time ./z debug_info debug_info.zstd 4
./z debug_info debug_info.zstd 4 3.47s user 0.63s system 267% cpu 1.533 total
% time ./z debug_info debug_info.zstd 8
./z debug_info debug_info.zstd 8 3.76s user 0.66s system 349% cpu 1.263 total
The zstd CLI program is significantly faster. I do not know whether that is because it reads its input asynchronously or because of some other feature I have missed. Filed https://github.com/llvm/llvm-project/issues/57685
% time /tmp/p/zstd/out/release/programs/zstd -fq -T1 debug_info
/tmp/p/zstd/out/release/programs/zstd -fq -T1 debug_info 2.98s user 0.51s system 126% cpu 2.767 total
% time /tmp/p/zstd/out/release/programs/zstd -fq -T2 debug_info
/tmp/p/zstd/out/release/programs/zstd -fq -T2 debug_info 3.02s user 0.52s system 235% cpu 1.501 total
% time /tmp/p/zstd/out/release/programs/zstd -fq -T4 debug_info
/tmp/p/zstd/out/release/programs/zstd -fq -T4 debug_info 3.02s user 0.51s system 435% cpu 0.811 total
#include <algorithm>
#include <vector>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <zstd.h>
int main(int argc, char *argv[]) {
int fdin = open(argv[1], O_RDONLY);
if (fdin < 0) return 1;
struct stat st;
if (fstat(fdin, &st) < 0) return 1;
void *in = mmap(0, st.st_size, PROT_READ, MAP_SHARED, fdin, 0);
if (in == MAP_FAILED) return 1;
int fdout = open(argv[2], O_RDWR);
if (fdout < 0) return 1;
int th = 0;
if (argc > 3)
th = atoi(argv[3]);
std::vector<uint8_t> out;
out.resize(64);
size_t pos = 0;
ZSTD_CCtx *cctx = ZSTD_createCCtx();
if (!cctx)
return 1;
if (ZSTD_isError(ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, th)))
return 2;
ZSTD_outBuffer zob = {out.data(), out.size(), 0};
auto directive = ZSTD_e_continue;
do {
size_t n = std::min(st.st_size-pos, (size_t)1<<20);
if (n == st.st_size-pos)
directive = ZSTD_e_end;
ZSTD_inBuffer zib = { (char*)in+pos, n, 0 };
size_t more = 1;
while (zib.pos != zib.size || directive == ZSTD_e_end && more != 0) {
if (zob.pos == zob.size) {
out.resize(out.size() * 3 / 2);
zob.dst = out.data();
zob.size = out.size();
}
more = ZSTD_compressStream2(cctx, &zob, &zib, directive);
if (ZSTD_isError(more)) {
fprintf(stderr, "%s\n", ZSTD_getErrorName(more));
return 3;
}
}
pos += n;
} while (directive != ZSTD_e_end);
out.resize(zob.pos);
ftruncate(fdout, out.size());
void *mout = mmap(0, out.size(), PROT_READ | PROT_WRITE, MAP_SHARED, fdout, 0);
memcpy(mout, out.data(), out.size());
munmap(mout, out.size());
close(fdout);
ZSTD_freeCCtx(cctx);
}
MaskRay added a comment.
git clone https://github.com/llvm/llvm-project.git --depth=1
cd llvm-project
curl -L 'https://reviews.llvm.org/D133679?download=1' | patch -p1
# Build lld. See https://llvm.org/docs/GettingStarted.html
cmake -GNinja -Sllvm -B/tmp/out/custom1 -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=lld -DLLVM_ENABLE_ZSTD=FORCE_ON -DCMAKE_PREFIX_PATH=/tmp/opt/zstd -DLLVM_ENABLE_LLD=on
ninja -C /tmp/out/custom1 lld
No compression
% time /tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=2
/tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=2 9.89s user 2.92s system 151% cpu 8.477 total
% time /tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=4
/tmp/out/custom1/bin/ld.lld @response.txt -o a.out --threads=4 10.82s user 3.08s system 209% cpu 6.640 total
zstd
% time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=1
/tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o 14.19s user 3.10s system 104% cpu 16.532 total
% time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=2
/tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o 15.16s user 3.83s system 162% cpu 11.657 total
% time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=4
/tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o 16.73s user 3.77s system 219% cpu 9.323 total
% time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o a.out --threads=8
/tmp/out/custom1/bin/ld.lld --compress-debug-sections=zstd @response.txt -o 18.97s user 4.04s system 280% cpu 8.194 total
zlib
% time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o a.out --threads=2
/tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o 23.68s user 3.02s system 168% cpu 15.805 total
% time /tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o a.out --threads=4
/tmp/out/custom1/bin/ld.lld --compress-debug-sections=zlib @response.txt -o 24.55s user 3.43s system 253% cpu 11.036 total
MaskRay added a comment.
update
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D133679
Files:
lld/ELF/OutputSections.cpp
Index: lld/ELF/OutputSections.cpp
===================================================================
--- lld/ELF/OutputSections.cpp
+++ lld/ELF/OutputSections.cpp
@@ -24,6 +24,9 @@
#if LLVM_ENABLE_ZLIB
#include <zlib.h>
#endif
+#if LLVM_ENABLE_ZSTD
+#include <zstd.h>
+#endif
using namespace llvm;
using namespace llvm::dwarf;
@@ -331,25 +334,55 @@
llvm::TimeTraceScope timeScope("Compress debug sections");
compressed.uncompressedSize = size;
auto buf = std::make_unique<uint8_t[]>(size);
- if (config->compressDebugSections == DebugCompressionType::Zstd) {
- {
- parallel::TaskGroup tg;
- writeTo<ELFT>(buf.get(), tg);
- }
- compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
- compression::zstd::compress(makeArrayRef(buf.get(), size),
- compressed.shards[0]);
- size = sizeof(Elf_Chdr) + compressed.shards[0].size();
- flags |= SHF_COMPRESSED;
- return;
- }
-
-#if LLVM_ENABLE_ZLIB
// Write uncompressed data to a temporary zero-initialized buffer.
{
parallel::TaskGroup tg;
writeTo<ELFT>(buf.get(), tg);
}
+
+#if LLVM_ENABLE_ZSTD
+ if (config->compressDebugSections == DebugCompressionType::Zstd) {
+ // Allocate a buffer of half of the input size, and grow it by 1.5x if
+ // insufficient.
+ compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
+ SmallVector<uint8_t, 0> &out = compressed.shards[0];
+ out.resize_for_overwrite(std::max<size_t>(size / 2, 32));
+ size_t pos = 0;
+
+ ZSTD_CCtx *cctx = ZSTD_createCCtx();
+ size_t ret = ZSTD_CCtx_setParameter(
+ cctx, ZSTD_c_nbWorkers, parallel::strategy.compute_thread_count());
+ if (ZSTD_isError(ret))
+ fatal(Twine("ZSTD_CCtx_setParameter: ") + ZSTD_getErrorName(ret));
+ ZSTD_outBuffer zob = {out.data(), out.size(), 0};
+ ZSTD_EndDirective directive = ZSTD_e_continue;
+ do {
+ const size_t n = std::min(size - pos, (size_t)1 << 20);
+ if (n == size - pos)
+ directive = ZSTD_e_end;
+ ZSTD_inBuffer zib = {buf.get() + pos, n, 0};
+ size_t more = 1;
+ while (zib.pos != zib.size || (directive == ZSTD_e_end && more != 0)) {
+ if (zob.pos == zob.size) {
+ out.resize_for_overwrite(out.size() * 3 / 2);
+ zob.dst = out.data();
+ zob.size = out.size();
+ }
+ more = ZSTD_compressStream2(cctx, &zob, &zib, directive);
+ assert(!ZSTD_isError(more));
+ }
+ pos += n;
+ } while (directive != ZSTD_e_end);
+ out.resize(zob.pos);
+ ZSTD_freeCCtx(cctx);
+
+ size = sizeof(Elf_Chdr) + out.size();
+ flags |= SHF_COMPRESSED;
+ return;
+ }
+#endif
+
+#if LLVM_ENABLE_ZLIB
// We chose 1 (Z_BEST_SPEED) as the default compression level because it is
// the fastest. If -O2 is given, we use level 6 to compress debug info more by
// ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D133679.459392.patch
Type: text/x-patch
Size: 2954 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220912/40aa7e0f/attachment.bin>
More information about the llvm-commits
mailing list