[lld] [llvm] [lld][COFF] Remove duplicate strtab entries (PR #141197)
Haohai Wen via llvm-commits
llvm-commits at lists.llvm.org
Tue May 27 19:52:03 PDT 2025
https://github.com/HaohaiWen updated https://github.com/llvm/llvm-project/pull/141197
>From 4204ca07e85470fa9169025f1c7a8172f4bb03c9 Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Fri, 23 May 2025 12:36:07 +0800
Subject: [PATCH 1/2] [lld][COFF] Remove duplicate strtab entries
String table size is too big for large binary when symbol table is
enabled. Some strings in strtab is same so it can be reused.
---
lld/COFF/Writer.cpp | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index db6133e20a037..6ec83a5f77e5a 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -285,6 +285,7 @@ class Writer {
std::unique_ptr<FileOutputBuffer> &buffer;
std::map<PartialSectionKey, PartialSection *> partialSections;
std::vector<char> strtab;
+ StringMap<size_t> strtabMap;
std::vector<llvm::object::coff_symbol16> outputSymtab;
std::vector<ECCodeMapEntry> codeMap;
IdataContents idata;
@@ -1439,10 +1440,13 @@ void Writer::assignOutputSectionIndices() {
size_t Writer::addEntryToStringTable(StringRef str) {
assert(str.size() > COFF::NameSize);
- size_t offsetOfEntry = strtab.size() + 4; // +4 for the size field
- strtab.insert(strtab.end(), str.begin(), str.end());
- strtab.push_back('\0');
- return offsetOfEntry;
+ size_t newOffsetOfEntry = strtab.size() + 4; // +4 for the size field
+ auto res = strtabMap.try_emplace(str, newOffsetOfEntry);
+ if (res.second) {
+ strtab.insert(strtab.end(), str.begin(), str.end());
+ strtab.push_back('\0');
+ }
+ return res.first->getValue();
}
std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
>From a4a6c910a7578a895ed0c84ab588b7af49e97fbc Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Tue, 27 May 2025 14:15:27 +0800
Subject: [PATCH 2/2] Using prioritized StringTableBuilder
---
lld/COFF/Writer.cpp | 53 +++++++++++++----------
llvm/include/llvm/MC/StringTableBuilder.h | 17 +++++---
llvm/lib/MC/StringTableBuilder.cpp | 37 ++++++++++++++--
3 files changed, 77 insertions(+), 30 deletions(-)
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 6ec83a5f77e5a..2807ab53d9aa1 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
@@ -204,7 +205,8 @@ struct ChunkRange {
class Writer {
public:
Writer(COFFLinkerContext &c)
- : buffer(c.e.outputBuffer), delayIdata(c), ctx(c) {}
+ : buffer(c.e.outputBuffer), strtab(StringTableBuilder::WinCOFF),
+ delayIdata(c), ctx(c) {}
void run();
private:
@@ -284,8 +286,7 @@ class Writer {
std::unique_ptr<FileOutputBuffer> &buffer;
std::map<PartialSectionKey, PartialSection *> partialSections;
- std::vector<char> strtab;
- StringMap<size_t> strtabMap;
+ StringTableBuilder strtab;
std::vector<llvm::object::coff_symbol16> outputSymtab;
std::vector<ECCodeMapEntry> codeMap;
IdataContents idata;
@@ -1438,17 +1439,6 @@ void Writer::assignOutputSectionIndices() {
sc->setOutputSectionIdx(mc->getOutputSectionIdx());
}
-size_t Writer::addEntryToStringTable(StringRef str) {
- assert(str.size() > COFF::NameSize);
- size_t newOffsetOfEntry = strtab.size() + 4; // +4 for the size field
- auto res = strtabMap.try_emplace(str, newOffsetOfEntry);
- if (res.second) {
- strtab.insert(strtab.end(), str.begin(), str.end());
- strtab.push_back('\0');
- }
- return res.first->getValue();
-}
-
std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
coff_symbol16 sym;
switch (def->kind()) {
@@ -1489,7 +1479,8 @@ std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
StringRef name = def->getName();
if (name.size() > COFF::NameSize) {
sym.Name.Offset.Zeroes = 0;
- sym.Name.Offset.Offset = addEntryToStringTable(name);
+ sym.Name.Offset.Offset = 0; // Filled in later.
+ strtab.add(name);
} else {
memset(sym.Name.ShortName, 0, COFF::NameSize);
memcpy(sym.Name.ShortName, name.data(), name.size());
@@ -1521,6 +1512,7 @@ void Writer::createSymbolAndStringTable() {
// solution where discardable sections have long names preserved and
// non-discardable sections have their names truncated, to ensure that any
// section which is mapped at runtime also has its name mapped at runtime.
+ SmallVector<OutputSection *> longNameSections;
for (OutputSection *sec : ctx.outputSections) {
if (sec->name.size() <= COFF::NameSize)
continue;
@@ -1532,9 +1524,13 @@ void Writer::createSymbolAndStringTable() {
<< " is longer than 8 characters and will use a non-standard string "
"table";
}
- sec->setStringTableOff(addEntryToStringTable(sec->name));
+ // Put the section name in the begin of strtab so that its offset is less
+ // than Max7DecimalOffset otherwise lldb/gdb will not read it.
+ strtab.add(sec->name, /*Priority=*/UINT8_MAX);
+ longNameSections.push_back(sec);
}
+ std::vector<std::pair<size_t, StringRef>> longNameSymbols;
if (ctx.config.writeSymtab) {
for (ObjFile *file : ctx.objFileInstances) {
for (Symbol *b : file->getSymbols()) {
@@ -1549,15 +1545,22 @@ void Writer::createSymbolAndStringTable() {
continue;
}
- if (std::optional<coff_symbol16> sym = createSymbol(d))
+ if (std::optional<coff_symbol16> sym = createSymbol(d)) {
+ if (d->getName().size() > COFF::NameSize)
+ longNameSymbols.emplace_back(outputSymtab.size(), d->getName());
outputSymtab.push_back(*sym);
+ }
if (auto *dthunk = dyn_cast<DefinedImportThunk>(d)) {
if (!dthunk->wrappedSym->writtenToSymtab) {
dthunk->wrappedSym->writtenToSymtab = true;
if (std::optional<coff_symbol16> sym =
- createSymbol(dthunk->wrappedSym))
+ createSymbol(dthunk->wrappedSym)) {
+ if (d->getName().size() > COFF::NameSize)
+ longNameSymbols.emplace_back(outputSymtab.size(),
+ dthunk->wrappedSym->getName());
outputSymtab.push_back(*sym);
+ }
}
}
}
@@ -1567,11 +1570,19 @@ void Writer::createSymbolAndStringTable() {
if (outputSymtab.empty() && strtab.empty())
return;
+ strtab.finalize();
+ for (OutputSection *sec : longNameSections)
+ sec->setStringTableOff(strtab.getOffset(sec->name));
+ for (auto P : longNameSymbols) {
+ coff_symbol16 &sym = outputSymtab[P.first];
+ sym.Name.Offset.Offset = strtab.getOffset(P.second);
+ }
+
// We position the symbol table to be adjacent to the end of the last section.
uint64_t fileOff = fileSize;
pointerToSymbolTable = fileOff;
fileOff += outputSymtab.size() * sizeof(coff_symbol16);
- fileOff += 4 + strtab.size();
+ fileOff += strtab.getSize();
fileSize = alignTo(fileOff, ctx.config.fileAlign);
}
@@ -1952,9 +1963,7 @@ template <typename PEHeaderTy> void Writer::writeHeader() {
// Create the string table, it follows immediately after the symbol table.
// The first 4 bytes is length including itself.
buf = reinterpret_cast<uint8_t *>(&symbolTable[numberOfSymbols]);
- write32le(buf, strtab.size() + 4);
- if (!strtab.empty())
- memcpy(buf + 4, strtab.data(), strtab.size());
+ strtab.write(buf);
}
void Writer::openFile(StringRef path) {
diff --git a/llvm/include/llvm/MC/StringTableBuilder.h b/llvm/include/llvm/MC/StringTableBuilder.h
index a738683548cfa..d594df5acddaa 100644
--- a/llvm/include/llvm/MC/StringTableBuilder.h
+++ b/llvm/include/llvm/MC/StringTableBuilder.h
@@ -37,6 +37,8 @@ class StringTableBuilder {
};
private:
+ // Only non-zero priority will be recorded.
+ DenseMap<CachedHashStringRef, uint8_t> StringPriorityMap;
DenseMap<CachedHashStringRef, size_t> StringIndexMap;
size_t Size = 0;
Kind K;
@@ -50,11 +52,15 @@ class StringTableBuilder {
StringTableBuilder(Kind K, Align Alignment = Align(1));
~StringTableBuilder();
- /// Add a string to the builder. Returns the position of S in the
- /// table. The position will be changed if finalize is used.
- /// Can only be used before the table is finalized.
- size_t add(CachedHashStringRef S);
- size_t add(StringRef S) { return add(CachedHashStringRef(S)); }
+ /// Add a string to the builder. Returns the position of S in the table. The
+ /// position will be changed if finalize is used. Can only be used before the
+ /// table is finalized. Priority is only useful with reordering. Strings with
+ /// same priority will be put together. Strings with higher priority are
+ /// placed closer to the begin of string table.
+ size_t add(CachedHashStringRef S, uint8_t Priority = 0);
+ size_t add(StringRef S, uint8_t Priority = 0) {
+ return add(CachedHashStringRef(S), Priority);
+ }
/// Analyze the strings and build the final table. No more strings can
/// be added after this point.
@@ -77,6 +83,7 @@ class StringTableBuilder {
bool contains(StringRef S) const { return contains(CachedHashStringRef(S)); }
bool contains(CachedHashStringRef S) const { return StringIndexMap.count(S); }
+ bool empty() const { return StringIndexMap.empty(); }
size_t getSize() const { return Size; }
void clear();
diff --git a/llvm/lib/MC/StringTableBuilder.cpp b/llvm/lib/MC/StringTableBuilder.cpp
index df316bae98cea..2cbd18ef7d1bd 100644
--- a/llvm/lib/MC/StringTableBuilder.cpp
+++ b/llvm/lib/MC/StringTableBuilder.cpp
@@ -138,13 +138,41 @@ void StringTableBuilder::finalizeInOrder() {
void StringTableBuilder::finalizeStringTable(bool Optimize) {
Finalized = true;
- if (Optimize) {
+ if (Optimize && StringIndexMap.size()) {
std::vector<StringPair *> Strings;
Strings.reserve(StringIndexMap.size());
for (StringPair &P : StringIndexMap)
Strings.push_back(&P);
- multikeySort(Strings, 0);
+ auto getStringPriority = [this](const CachedHashStringRef Str) -> uint8_t {
+ if (StringPriorityMap.contains(Str))
+ return StringPriorityMap[Str];
+ return 0;
+ };
+
+ size_t RangeBegin = 0;
+ MutableArrayRef<StringPair *> StringsRef(Strings);
+ if (StringPriorityMap.size()) {
+ llvm::sort(Strings,
+ [&](const StringPair *LHS, const StringPair *RHS) -> bool {
+ return getStringPriority(LHS->first) >
+ getStringPriority(RHS->first);
+ });
+ // Make sure ArrayRef is valid. Although std::sort implementaion is
+ // normally in-place , it is not guaranteed by standard.
+ StringsRef = Strings;
+
+ uint8_t RangePriority = getStringPriority(Strings[0]->first);
+ for (size_t I = 1, E = Strings.size(); I != E && RangePriority; ++I) {
+ uint8_t Priority = getStringPriority(Strings[I]->first);
+ if (Priority != RangePriority) {
+ multikeySort(StringsRef.slice(RangeBegin, I - RangeBegin), 0);
+ RangePriority = Priority;
+ RangeBegin = I;
+ }
+ }
+ }
+ multikeySort(StringsRef.slice(RangeBegin), 0);
initSize();
StringRef Previous;
@@ -199,11 +227,14 @@ size_t StringTableBuilder::getOffset(CachedHashStringRef S) const {
return I->second;
}
-size_t StringTableBuilder::add(CachedHashStringRef S) {
+size_t StringTableBuilder::add(CachedHashStringRef S, uint8_t Priority) {
if (K == WinCOFF)
assert(S.size() > COFF::NameSize && "Short string in COFF string table!");
assert(!isFinalized());
+ if (Priority)
+ StringPriorityMap[S] = std::max(Priority, StringPriorityMap[S]);
+
auto P = StringIndexMap.insert(std::make_pair(S, 0));
if (P.second) {
size_t Start = alignTo(Size, Alignment);
More information about the llvm-commits
mailing list