[lld] c676104 - [lld-macho] Implement symbol string deduplication (#123874)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 23 15:48:15 PST 2025


Author: alx32
Date: 2025-01-23T15:48:11-08:00
New Revision: c676104875f34a87051b446469cc395932bc1f13

URL: https://github.com/llvm/llvm-project/commit/c676104875f34a87051b446469cc395932bc1f13
DIFF: https://github.com/llvm/llvm-project/commit/c676104875f34a87051b446469cc395932bc1f13.diff

LOG: [lld-macho] Implement symbol string deduplication (#123874)

The symbol string table does not have deduplication.
Here we add code to deduplicate the symbol string table.
This yields a rather large size reduction (20-30%) on unstripped binaries
(typically debug binaries) but has no size impact on stripped
binaries (typically release binaries).

We enable deduplication by default and add a flag to disable it
(`-no-deduplicate-symbol-strings`).

Added: 
    

Modified: 
    lld/MachO/Config.h
    lld/MachO/Driver.cpp
    lld/MachO/Options.td
    lld/MachO/SyntheticSections.cpp
    lld/MachO/SyntheticSections.h
    lld/test/MachO/cfstring-dedup.s

Removed: 
    


################################################################################
diff  --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index d41ca5382c692a..f8dcc84e4ee1ba 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -143,6 +143,7 @@ struct Configuration {
   bool timeTraceEnabled = false;
   bool dataConst = false;
   bool dedupStrings = true;
+  bool dedupSymbolStrings = true;
   bool deadStripDuplicates = false;
   bool omitDebugInfo = false;
   bool warnDylibInstallName = false;

diff  --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 31630ba7d69de2..4f6c9b4ddc7984 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1806,6 +1806,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
   config->keepICFStabs = args.hasArg(OPT_keep_icf_stabs);
   config->dedupStrings =
       args.hasFlag(OPT_deduplicate_strings, OPT_no_deduplicate_strings, true);
+  config->dedupSymbolStrings = !args.hasArg(OPT_no_deduplicate_symbol_strings);
   config->deadStripDuplicates = args.hasArg(OPT_dead_strip_duplicates);
   config->warnDylibInstallName = args.hasFlag(
       OPT_warn_dylib_install_name, OPT_no_warn_dylib_install_name, false);

diff  --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 4c89f96c3ebaad..9001e85582c124 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -1476,3 +1476,8 @@ def no_warn_duplicate_libraries : Flag<["-"], "no_warn_duplicate_libraries">,
     HelpText<"Do not warn if the input contains duplicate library options.">,
     Flags<[HelpHidden]>,
     Group<grp_ignored_silently>;
+
+// Add this with the other flags in the rare options group
+def no_deduplicate_symbol_strings : Flag<["-"], "no-deduplicate-symbol-strings">,
+    HelpText<"Do not deduplicate strings in the symbol string table. Might result in larger binaries but slightly faster link times.">,
+    Group<grp_rare>;

diff  --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 97164e5992b8c7..60b57bb3a192c9 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -1541,7 +1541,14 @@ StringTableSection::StringTableSection()
 
 uint32_t StringTableSection::addString(StringRef str) {
   uint32_t strx = size;
-  strings.push_back(str); // TODO: consider deduplicating strings
+  if (config->dedupSymbolStrings) {
+    llvm::CachedHashStringRef hashedStr(str);
+    auto [it, inserted] = stringMap.try_emplace(hashedStr, strx);
+    if (!inserted)
+      return it->second;
+  }
+
+  strings.push_back(str);
   size += str.size() + 1; // account for null terminator
   return strx;
 }

diff  --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index af99f22788d6e9..5796b0790c83a0 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -447,6 +447,7 @@ class StringTableSection final : public LinkEditSection {
   // match its behavior here since some tools depend on it.
   // Consequently, the empty string will be at index 1, not zero.
   std::vector<StringRef> strings{" "};
+  llvm::DenseMap<llvm::CachedHashStringRef, uint32_t> stringMap;
   size_t size = 2;
 };
 

diff  --git a/lld/test/MachO/cfstring-dedup.s b/lld/test/MachO/cfstring-dedup.s
index fb121cde3e9585..4f490ba4380e13 100644
--- a/lld/test/MachO/cfstring-dedup.s
+++ b/lld/test/MachO/cfstring-dedup.s
@@ -7,6 +7,17 @@
 # RUN: %lld -dylib -framework CoreFoundation %t/foo1.o %t/foo2.o -o %t/foo
 # RUN: llvm-objdump --no-print-imm-hex --macho --rebase --bind --syms -d %t/foo | FileCheck %s --check-prefix=LITERALS
 
+# Check that string deduplication for symbol names is working
+# RUN: %lld -dylib -framework CoreFoundation %t/foo1.o %t/foo2.o -o %t/foo_no_dedup -no-deduplicate-symbol-strings
+# RUN: llvm-strings %t/foo | FileCheck %s --check-prefix=CHECK-DEDUP
+# RUN: llvm-strings %t/foo_no_dedup | FileCheck %s --check-prefix=CHECK-NO-DEDUP
+# CHECK-DEDUP: _named_cfstring
+# CHECK-DEDUP-NOT: _named_cfstring
+# CHECK-NO-DEDUP: _named_cfstring
+# CHECK-NO-DEDUP: _named_cfstring
+# CHECK-NO-DEDUP-NOT: _named_cfstring
+
+
 # CHECK:       (__TEXT,__text) section
 # CHECK-NEXT:  _foo1:
 # CHECK-NEXT:  _foo2:


        


More information about the llvm-commits mailing list