[lld] 79095b4 - [ELF] --compress-debug-sections=zstd: replace ZSTD_c_nbWorkers parallelism with multi-frame parallelism
Fangrui Song via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 29 22:05:40 PDT 2024
Author: Fangrui Song
Date: 2024-04-29T22:05:35-07:00
New Revision: 79095b4079e8d4f8176bcc53fdacd2765f310cdb
URL: https://github.com/llvm/llvm-project/commit/79095b4079e8d4f8176bcc53fdacd2765f310cdb
DIFF: https://github.com/llvm/llvm-project/commit/79095b4079e8d4f8176bcc53fdacd2765f310cdb.diff
LOG: [ELF] --compress-debug-sections=zstd: replace ZSTD_c_nbWorkers parallelism with multi-frame parallelism
https://reviews.llvm.org/D133679 utilizes zstd's multithread API to
create a single frame. That provides a higher compression ratio but is
significantly slower than compressing 1-MiB shards in parallel and
concatenating the resulting independent frames.
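As a minimal standalone sketch (not lld code; the shard size, level, and
buffer handling are simplified here) of why the concatenation is valid:
each shard becomes an independent zstd frame, and zstd's streaming
decoder consumes a concatenation of frames as a single stream.

#include <zstd.h>
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::string input(3 << 20, 'x');  // stand-in for a .debug_* payload
  const size_t shardSize = 1 << 20; // 1-MiB shards, as in the patch

  // Compress each shard into its own frame (lld does this in a parallelFor).
  std::string stream;
  for (size_t pos = 0; pos < input.size(); pos += shardSize) {
    size_t n = std::min(shardSize, input.size() - pos);
    std::vector<char> out(ZSTD_compressBound(n));
    size_t csize = ZSTD_compress(out.data(), out.size(), input.data() + pos, n, 1);
    assert(!ZSTD_isError(csize));
    stream.append(out.data(), csize); // frames simply concatenate
  }

  // The streaming decoder accepts the concatenation, starting a new frame
  // whenever the previous one ends.
  std::string roundTrip(input.size(), '\0');
  ZSTD_DCtx *dctx = ZSTD_createDCtx();
  ZSTD_inBuffer zib = {stream.data(), stream.size(), 0};
  ZSTD_outBuffer zob = {&roundTrip[0], roundTrip.size(), 0};
  while (zib.pos != zib.size) {
    size_t ret = ZSTD_decompressStream(dctx, &zob, &zib);
    assert(!ZSTD_isError(ret));
  }
  ZSTD_freeDCtx(dctx);
  assert(roundTrip == input);
}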
With manual parallelism, it is also easier to parallelize the memcpy
calls in OutputSection::writeTo.
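A hedged sketch of that pattern (copyShards is a hypothetical helper;
llvm::parallelFor is the real LLVM primitive): once each shard's output
offset is the prefix sum of the preceding shard sizes, the copies write
disjoint ranges and can safely run concurrently.

#include <cstdint>
#include <cstring>
#include <vector>
#include "llvm/Support/Parallel.h"

static void copyShards(uint8_t *buf,
                       const std::vector<std::vector<uint8_t>> &shards) {
  // Exclusive prefix sum of shard sizes gives each shard's destination.
  std::vector<size_t> offsets(shards.size()); // offsets[0] == 0
  for (size_t i = 1; i != shards.size(); ++i)
    offsets[i] = offsets[i - 1] + shards[i - 1].size();
  // The destination ranges are disjoint, so the copies can run in parallel.
  llvm::parallelFor(0, shards.size(), [&](size_t i) {
    memcpy(buf + offsets[i], shards[i].data(), shards[i].size());
  });
}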
In addition, as each shard's allocated compression buffer is much
smaller, we can make a wild guess at its size (uncompressed shard size
/ 4) without worrying that a resize after a wrong guess would waste
memory.
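A sketch of the per-shard loop this enables (compressShard is a
hypothetical standalone equivalent of the hunk below): start from the
size/4 guess, grow the output buffer by 1.5x whenever it fills, and
trim the slack once ZSTD_compressStream2 reports the frame fully
flushed.

#include <zstd.h>
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> compressShard(const uint8_t *in, size_t inSize) {
  std::vector<uint8_t> out;
  ZSTD_CCtx *cctx = ZSTD_createCCtx();
  ZSTD_inBuffer zib = {in, inSize, 0};
  ZSTD_outBuffer zob = {nullptr, 0, 0};
  size_t ret;
  do {
    if (zob.pos == zob.size) { // first allocation, or the buffer filled up
      out.resize(zob.size ? zob.size * 3 / 2
                          : std::max<size_t>(inSize / 4, 64));
      zob = {out.data(), out.size(), zob.pos};
    }
    ret = ZSTD_compressStream2(cctx, &zob, &zib, ZSTD_e_end);
    assert(!ZSTD_isError(ret));
  } while (ret != 0); // with ZSTD_e_end, 0 means the frame is fully flushed
  ZSTD_freeCCtx(cctx);
  out.resize(zob.pos); // drop the unused tail
  return out;
}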
Added:
Modified:
lld/ELF/OutputSections.cpp
Removed:
################################################################################
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index eadab9d745d687..3e58ed4bda2d3c 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -364,50 +364,43 @@ template <class ELFT> void OutputSection::maybeCompress() {
// useful when there are many compressed output sections.
addralign = 1;
+ // Split input into 1-MiB shards.
+ [[maybe_unused]] constexpr size_t shardSize = 1 << 20;
+ auto shardsIn = split(ArrayRef<uint8_t>(buf.get(), size), shardSize);
+ const size_t numShards = shardsIn.size();
+ compressed.numShards = numShards;
+ auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
+
#if LLVM_ENABLE_ZSTD
// Use ZSTD's streaming compression API which permits parallel workers working
// on the stream. See http://facebook.github.io/zstd/zstd_manual.html
// "Streaming compression - HowTo".
if (ctype == DebugCompressionType::Zstd) {
- // Allocate a buffer of half of the input size, and grow it by 1.5x if
- // insufficient.
- compressed.type = ELFCOMPRESS_ZSTD;
- compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
- SmallVector<uint8_t, 0> &out = compressed.shards[0];
- out.resize_for_overwrite(std::max<size_t>(size / 2, 32));
- size_t pos = 0;
-
- ZSTD_CCtx *cctx = ZSTD_createCCtx();
- // Ignore error if zstd was not built with ZSTD_MULTITHREAD.
- (void)ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers,
- parallel::strategy.compute_thread_count());
- ZSTD_outBuffer zob = {out.data(), out.size(), 0};
- ZSTD_EndDirective directive = ZSTD_e_continue;
- const size_t blockSize = ZSTD_CStreamInSize();
- do {
- const size_t n = std::min(static_cast<size_t>(size - pos), blockSize);
- if (n == size - pos)
- directive = ZSTD_e_end;
- ZSTD_inBuffer zib = {buf.get() + pos, n, 0};
- size_t bytesRemaining = 0;
- while (zib.pos != zib.size ||
- (directive == ZSTD_e_end && bytesRemaining != 0)) {
+ parallelFor(0, numShards, [&](size_t i) {
+ SmallVector<uint8_t, 0> out;
+ ZSTD_CCtx *cctx = ZSTD_createCCtx();
+ ZSTD_inBuffer zib = {shardsIn[i].data(), shardsIn[i].size(), 0};
+ ZSTD_outBuffer zob = {nullptr, 0, 0};
+ size_t size;
+ do {
+ // Allocate a buffer of half of the input size, and grow it by 1.5x if
+ // insufficient.
if (zob.pos == zob.size) {
- out.resize_for_overwrite(out.size() * 3 / 2);
- zob.dst = out.data();
- zob.size = out.size();
+ out.resize_for_overwrite(
+ zob.size ? zob.size * 3 / 2 : std::max<size_t>(zib.size / 4, 64));
+ zob = {out.data(), out.size(), zob.pos};
}
- bytesRemaining = ZSTD_compressStream2(cctx, &zob, &zib, directive);
- assert(!ZSTD_isError(bytesRemaining));
- }
- pos += n;
- } while (directive != ZSTD_e_end);
- out.resize(zob.pos);
- ZSTD_freeCCtx(cctx);
-
- size = sizeof(Elf_Chdr) + out.size();
- flags |= SHF_COMPRESSED;
- return;
+ size = ZSTD_compressStream2(cctx, &zob, &zib, ZSTD_e_end);
+ assert(!ZSTD_isError(size));
+ } while (size != 0);
+ out.truncate(zob.pos);
+ ZSTD_freeCCtx(cctx);
+ shardsOut[i] = std::move(out);
+ });
+ compressed.type = ELFCOMPRESS_ZSTD;
+ size = sizeof(Elf_Chdr);
+ for (size_t i = 0; i != numShards; ++i)
+ size += shardsOut[i].size();
}
#endif
@@ -417,37 +410,32 @@ template <class ELFT> void OutputSection::maybeCompress() {
// ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
// compression) while they take significant amount of time (~2x), so level 6
// seems enough.
- const int level = config->optimize >= 2 ? 6 : Z_BEST_SPEED;
-
- // Split input into 1-MiB shards.
- constexpr size_t shardSize = 1 << 20;
- auto shardsIn = split(ArrayRef<uint8_t>(buf.get(), size), shardSize);
- const size_t numShards = shardsIn.size();
-
- // Compress shards and compute Adler-32 checksums. Use Z_SYNC_FLUSH for all
- // shards but the last to flush the output to a byte boundary to be
- // concatenated with the next shard.
- auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
- auto shardsAdler = std::make_unique<uint32_t[]>(numShards);
- parallelFor(0, numShards, [&](size_t i) {
- shardsOut[i] = deflateShard(shardsIn[i], level,
- i != numShards - 1 ? Z_SYNC_FLUSH : Z_FINISH);
- shardsAdler[i] = adler32(1, shardsIn[i].data(), shardsIn[i].size());
- });
+ if (ctype == DebugCompressionType::Zlib) {
+ const int level = config->optimize >= 2 ? 6 : Z_BEST_SPEED;
+
+ // Compress shards and compute Adler-32 checksums. Use Z_SYNC_FLUSH for all
+ // shards but the last to flush the output to a byte boundary to be
+ // concatenated with the next shard.
+ auto shardsAdler = std::make_unique<uint32_t[]>(numShards);
+ parallelFor(0, numShards, [&](size_t i) {
+ shardsOut[i] = deflateShard(shardsIn[i], level,
+ i != numShards - 1 ? Z_SYNC_FLUSH : Z_FINISH);
+ shardsAdler[i] = adler32(1, shardsIn[i].data(), shardsIn[i].size());
+ });
- // Update section size and combine Adler-32 checksums.
- uint32_t checksum = 1; // Initial Adler-32 value
- size = sizeof(Elf_Chdr) + 2; // Elf_Chdr and zlib header
- for (size_t i = 0; i != numShards; ++i) {
- size += shardsOut[i].size();
- checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
+ // Update section size and combine Adler-32 checksums.
+ uint32_t checksum = 1; // Initial Adler-32 value
+ size = sizeof(Elf_Chdr) + 2; // Elf_Chdr and zlib header
+ for (size_t i = 0; i != numShards; ++i) {
+ size += shardsOut[i].size();
+ checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
+ }
+ size += 4; // checksum
+ compressed.type = ELFCOMPRESS_ZLIB;
+ compressed.checksum = checksum;
}
- size += 4; // checksum
- compressed.type = ELFCOMPRESS_ZLIB;
compressed.shards = std::move(shardsOut);
- compressed.numShards = numShards;
- compressed.checksum = checksum;
flags |= SHF_COMPRESSED;
#endif
}
@@ -479,25 +467,22 @@ void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
chdr->ch_size = compressed.uncompressedSize;
chdr->ch_addralign = addralign;
buf += sizeof(*chdr);
- if (compressed.type == ELFCOMPRESS_ZSTD) {
- memcpy(buf, compressed.shards[0].data(), compressed.shards[0].size());
- return;
+
+ auto offsets = std::make_unique<size_t[]>(compressed.numShards);
+ if (compressed.type == ELFCOMPRESS_ZLIB) {
+ buf[0] = 0x78; // CMF
+ buf[1] = 0x01; // FLG: best speed
+ offsets[0] = 2; // zlib header
+ write32be(buf + (size - sizeof(*chdr) - 4), compressed.checksum);
}
// Compute shard offsets.
- auto offsets = std::make_unique<size_t[]>(compressed.numShards);
- offsets[0] = 2; // zlib header
for (size_t i = 1; i != compressed.numShards; ++i)
offsets[i] = offsets[i - 1] + compressed.shards[i - 1].size();
-
- buf[0] = 0x78; // CMF
- buf[1] = 0x01; // FLG: best speed
parallelFor(0, compressed.numShards, [&](size_t i) {
memcpy(buf + offsets[i], compressed.shards[i].data(),
compressed.shards[i].size());
});
-
- write32be(buf + (size - sizeof(*chdr) - 4), compressed.checksum);
return;
}
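For reference, the zlib path moved (but not changed) above relies on the
analogous trick: deflating every shard but the last with Z_SYNC_FLUSH
aligns it to a byte boundary so the shards concatenate into one valid
stream, and the stream-wide Adler-32 is rebuilt from per-shard
checksums. A hedged sketch of that combining step (combineChecksums is a
hypothetical helper; adler32_combine is zlib's):

#include <zlib.h>
#include <cstddef>
#include <cstdint>

static uint32_t combineChecksums(const uint32_t *shardAdler,
                                 const size_t *shardLen, size_t numShards) {
  uint32_t checksum = 1; // initial Adler-32 value
  // adler32_combine(a, b, lenB) is the checksum of data with checksum a
  // followed by data with checksum b and length lenB.
  for (size_t i = 0; i != numShards; ++i)
    checksum = static_cast<uint32_t>(
        adler32_combine(checksum, shardAdler[i], shardLen[i]));
  return checksum;
}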