[lld] [llvm] [lld][COFF] Remove duplicate strtab entries (PR #141197)

Haohai Wen via llvm-commits llvm-commits at lists.llvm.org
Tue May 27 19:52:03 PDT 2025


https://github.com/HaohaiWen updated https://github.com/llvm/llvm-project/pull/141197

>From 4204ca07e85470fa9169025f1c7a8172f4bb03c9 Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Fri, 23 May 2025 12:36:07 +0800
Subject: [PATCH 1/2] [lld][COFF] Remove duplicate strtab entries

String table size is too big for large binary when symbol table is
enabled. Some strings in strtab is same so it can be reused.
---
 lld/COFF/Writer.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index db6133e20a037..6ec83a5f77e5a 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -285,6 +285,7 @@ class Writer {
   std::unique_ptr<FileOutputBuffer> &buffer;
   std::map<PartialSectionKey, PartialSection *> partialSections;
   std::vector<char> strtab;
+  StringMap<size_t> strtabMap;
   std::vector<llvm::object::coff_symbol16> outputSymtab;
   std::vector<ECCodeMapEntry> codeMap;
   IdataContents idata;
@@ -1439,10 +1440,13 @@ void Writer::assignOutputSectionIndices() {
 
 size_t Writer::addEntryToStringTable(StringRef str) {
   assert(str.size() > COFF::NameSize);
-  size_t offsetOfEntry = strtab.size() + 4; // +4 for the size field
-  strtab.insert(strtab.end(), str.begin(), str.end());
-  strtab.push_back('\0');
-  return offsetOfEntry;
+  size_t newOffsetOfEntry = strtab.size() + 4; // +4 for the size field
+  auto res = strtabMap.try_emplace(str, newOffsetOfEntry);
+  if (res.second) {
+    strtab.insert(strtab.end(), str.begin(), str.end());
+    strtab.push_back('\0');
+  }
+  return res.first->getValue();
 }
 
 std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {

>From a4a6c910a7578a895ed0c84ab588b7af49e97fbc Mon Sep 17 00:00:00 2001
From: Haohai Wen <haohai.wen at intel.com>
Date: Tue, 27 May 2025 14:15:27 +0800
Subject: [PATCH 2/2] Using prioritized StringTableBuilder

---
 lld/COFF/Writer.cpp                       | 53 +++++++++++++----------
 llvm/include/llvm/MC/StringTableBuilder.h | 17 +++++---
 llvm/lib/MC/StringTableBuilder.cpp        | 37 ++++++++++++++--
 3 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 6ec83a5f77e5a..2807ab53d9aa1 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
@@ -204,7 +205,8 @@ struct ChunkRange {
 class Writer {
 public:
   Writer(COFFLinkerContext &c)
-      : buffer(c.e.outputBuffer), delayIdata(c), ctx(c) {}
+      : buffer(c.e.outputBuffer), strtab(StringTableBuilder::WinCOFF),
+        delayIdata(c), ctx(c) {}
   void run();
 
 private:
@@ -284,8 +286,7 @@ class Writer {
 
   std::unique_ptr<FileOutputBuffer> &buffer;
   std::map<PartialSectionKey, PartialSection *> partialSections;
-  std::vector<char> strtab;
-  StringMap<size_t> strtabMap;
+  StringTableBuilder strtab;
   std::vector<llvm::object::coff_symbol16> outputSymtab;
   std::vector<ECCodeMapEntry> codeMap;
   IdataContents idata;
@@ -1438,17 +1439,6 @@ void Writer::assignOutputSectionIndices() {
           sc->setOutputSectionIdx(mc->getOutputSectionIdx());
 }
 
-size_t Writer::addEntryToStringTable(StringRef str) {
-  assert(str.size() > COFF::NameSize);
-  size_t newOffsetOfEntry = strtab.size() + 4; // +4 for the size field
-  auto res = strtabMap.try_emplace(str, newOffsetOfEntry);
-  if (res.second) {
-    strtab.insert(strtab.end(), str.begin(), str.end());
-    strtab.push_back('\0');
-  }
-  return res.first->getValue();
-}
-
 std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
   coff_symbol16 sym;
   switch (def->kind()) {
@@ -1489,7 +1479,8 @@ std::optional<coff_symbol16> Writer::createSymbol(Defined *def) {
   StringRef name = def->getName();
   if (name.size() > COFF::NameSize) {
     sym.Name.Offset.Zeroes = 0;
-    sym.Name.Offset.Offset = addEntryToStringTable(name);
+    sym.Name.Offset.Offset = 0; // Filled in later.
+    strtab.add(name);
   } else {
     memset(sym.Name.ShortName, 0, COFF::NameSize);
     memcpy(sym.Name.ShortName, name.data(), name.size());
@@ -1521,6 +1512,7 @@ void Writer::createSymbolAndStringTable() {
   // solution where discardable sections have long names preserved and
   // non-discardable sections have their names truncated, to ensure that any
   // section which is mapped at runtime also has its name mapped at runtime.
+  SmallVector<OutputSection *> longNameSections;
   for (OutputSection *sec : ctx.outputSections) {
     if (sec->name.size() <= COFF::NameSize)
       continue;
@@ -1532,9 +1524,13 @@ void Writer::createSymbolAndStringTable() {
           << " is longer than 8 characters and will use a non-standard string "
              "table";
     }
-    sec->setStringTableOff(addEntryToStringTable(sec->name));
+    // Put the section name in the begin of strtab so that its offset is less
+    // than Max7DecimalOffset otherwise lldb/gdb will not read it.
+    strtab.add(sec->name, /*Priority=*/UINT8_MAX);
+    longNameSections.push_back(sec);
   }
 
+  std::vector<std::pair<size_t, StringRef>> longNameSymbols;
   if (ctx.config.writeSymtab) {
     for (ObjFile *file : ctx.objFileInstances) {
       for (Symbol *b : file->getSymbols()) {
@@ -1549,15 +1545,22 @@ void Writer::createSymbolAndStringTable() {
             continue;
         }
 
-        if (std::optional<coff_symbol16> sym = createSymbol(d))
+        if (std::optional<coff_symbol16> sym = createSymbol(d)) {
+          if (d->getName().size() > COFF::NameSize)
+            longNameSymbols.emplace_back(outputSymtab.size(), d->getName());
           outputSymtab.push_back(*sym);
+        }
 
         if (auto *dthunk = dyn_cast<DefinedImportThunk>(d)) {
           if (!dthunk->wrappedSym->writtenToSymtab) {
             dthunk->wrappedSym->writtenToSymtab = true;
             if (std::optional<coff_symbol16> sym =
-                    createSymbol(dthunk->wrappedSym))
+                    createSymbol(dthunk->wrappedSym)) {
+              if (d->getName().size() > COFF::NameSize)
+                longNameSymbols.emplace_back(outputSymtab.size(),
+                                             dthunk->wrappedSym->getName());
               outputSymtab.push_back(*sym);
+            }
           }
         }
       }
@@ -1567,11 +1570,19 @@ void Writer::createSymbolAndStringTable() {
   if (outputSymtab.empty() && strtab.empty())
     return;
 
+  strtab.finalize();
+  for (OutputSection *sec : longNameSections)
+    sec->setStringTableOff(strtab.getOffset(sec->name));
+  for (auto P : longNameSymbols) {
+    coff_symbol16 &sym = outputSymtab[P.first];
+    sym.Name.Offset.Offset = strtab.getOffset(P.second);
+  }
+
   // We position the symbol table to be adjacent to the end of the last section.
   uint64_t fileOff = fileSize;
   pointerToSymbolTable = fileOff;
   fileOff += outputSymtab.size() * sizeof(coff_symbol16);
-  fileOff += 4 + strtab.size();
+  fileOff += strtab.getSize();
   fileSize = alignTo(fileOff, ctx.config.fileAlign);
 }
 
@@ -1952,9 +1963,7 @@ template <typename PEHeaderTy> void Writer::writeHeader() {
   // Create the string table, it follows immediately after the symbol table.
   // The first 4 bytes is length including itself.
   buf = reinterpret_cast<uint8_t *>(&symbolTable[numberOfSymbols]);
-  write32le(buf, strtab.size() + 4);
-  if (!strtab.empty())
-    memcpy(buf + 4, strtab.data(), strtab.size());
+  strtab.write(buf);
 }
 
 void Writer::openFile(StringRef path) {
diff --git a/llvm/include/llvm/MC/StringTableBuilder.h b/llvm/include/llvm/MC/StringTableBuilder.h
index a738683548cfa..d594df5acddaa 100644
--- a/llvm/include/llvm/MC/StringTableBuilder.h
+++ b/llvm/include/llvm/MC/StringTableBuilder.h
@@ -37,6 +37,8 @@ class StringTableBuilder {
   };
 
 private:
+  // Only non-zero priority will be recorded.
+  DenseMap<CachedHashStringRef, uint8_t> StringPriorityMap;
   DenseMap<CachedHashStringRef, size_t> StringIndexMap;
   size_t Size = 0;
   Kind K;
@@ -50,11 +52,15 @@ class StringTableBuilder {
   StringTableBuilder(Kind K, Align Alignment = Align(1));
   ~StringTableBuilder();
 
-  /// Add a string to the builder. Returns the position of S in the
-  /// table. The position will be changed if finalize is used.
-  /// Can only be used before the table is finalized.
-  size_t add(CachedHashStringRef S);
-  size_t add(StringRef S) { return add(CachedHashStringRef(S)); }
+  /// Add a string to the builder. Returns the position of S in the table. The
+  /// position will be changed if finalize is used. Can only be used before the
+  /// table is finalized. Priority is only useful with reordering. Strings with
+  /// same priority will be put together. Strings with higher priority are
+  /// placed closer to the begin of string table.
+  size_t add(CachedHashStringRef S, uint8_t Priority = 0);
+  size_t add(StringRef S, uint8_t Priority = 0) {
+    return add(CachedHashStringRef(S), Priority);
+  }
 
   /// Analyze the strings and build the final table. No more strings can
   /// be added after this point.
@@ -77,6 +83,7 @@ class StringTableBuilder {
   bool contains(StringRef S) const { return contains(CachedHashStringRef(S)); }
   bool contains(CachedHashStringRef S) const { return StringIndexMap.count(S); }
 
+  bool empty() const { return StringIndexMap.empty(); }
   size_t getSize() const { return Size; }
   void clear();
 
diff --git a/llvm/lib/MC/StringTableBuilder.cpp b/llvm/lib/MC/StringTableBuilder.cpp
index df316bae98cea..2cbd18ef7d1bd 100644
--- a/llvm/lib/MC/StringTableBuilder.cpp
+++ b/llvm/lib/MC/StringTableBuilder.cpp
@@ -138,13 +138,41 @@ void StringTableBuilder::finalizeInOrder() {
 void StringTableBuilder::finalizeStringTable(bool Optimize) {
   Finalized = true;
 
-  if (Optimize) {
+  if (Optimize && StringIndexMap.size()) {
     std::vector<StringPair *> Strings;
     Strings.reserve(StringIndexMap.size());
     for (StringPair &P : StringIndexMap)
       Strings.push_back(&P);
 
-    multikeySort(Strings, 0);
+    auto getStringPriority = [this](const CachedHashStringRef Str) -> uint8_t {
+      if (StringPriorityMap.contains(Str))
+        return StringPriorityMap[Str];
+      return 0;
+    };
+
+    size_t RangeBegin = 0;
+    MutableArrayRef<StringPair *> StringsRef(Strings);
+    if (StringPriorityMap.size()) {
+      llvm::sort(Strings,
+                 [&](const StringPair *LHS, const StringPair *RHS) -> bool {
+                   return getStringPriority(LHS->first) >
+                          getStringPriority(RHS->first);
+                 });
+      // Make sure ArrayRef is valid. Although std::sort implementaion is
+      // normally in-place , it is not guaranteed by standard.
+      StringsRef = Strings;
+
+      uint8_t RangePriority = getStringPriority(Strings[0]->first);
+      for (size_t I = 1, E = Strings.size(); I != E && RangePriority; ++I) {
+        uint8_t Priority = getStringPriority(Strings[I]->first);
+        if (Priority != RangePriority) {
+          multikeySort(StringsRef.slice(RangeBegin, I - RangeBegin), 0);
+          RangePriority = Priority;
+          RangeBegin = I;
+        }
+      }
+    }
+    multikeySort(StringsRef.slice(RangeBegin), 0);
     initSize();
 
     StringRef Previous;
@@ -199,11 +227,14 @@ size_t StringTableBuilder::getOffset(CachedHashStringRef S) const {
   return I->second;
 }
 
-size_t StringTableBuilder::add(CachedHashStringRef S) {
+size_t StringTableBuilder::add(CachedHashStringRef S, uint8_t Priority) {
   if (K == WinCOFF)
     assert(S.size() > COFF::NameSize && "Short string in COFF string table!");
 
   assert(!isFinalized());
+  if (Priority)
+    StringPriorityMap[S] = std::max(Priority, StringPriorityMap[S]);
+
   auto P = StringIndexMap.insert(std::make_pair(S, 0));
   if (P.second) {
     size_t Start = alignTo(Size, Alignment);



More information about the llvm-commits mailing list