[lld] 718c321 - [lld-macho] Only emit one BIND_OPCODE_SET_SYMBOL per symbol

Jez Ng via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 5 17:00:36 PDT 2021


Author: Jez Ng
Date: 2021-07-05T20:00:19-04:00
New Revision: 718c32175b3bcced88d9ca25ab1482a682ae5bbb

URL: https://github.com/llvm/llvm-project/commit/718c32175b3bcced88d9ca25ab1482a682ae5bbb
DIFF: https://github.com/llvm/llvm-project/commit/718c32175b3bcced88d9ca25ab1482a682ae5bbb.diff

LOG: [lld-macho] Only emit one BIND_OPCODE_SET_SYMBOL per symbol

Size-wise, BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM is the most
expensive opcode, since it comes with an associated symbol string. We
were previously emitting it once per binding, instead of once per
symbol. This diff groups all bindings for a given symbol together and
ensures we only emit one such opcode per symbol. This matches ld64's
behavior.

While this is a relatively small win on chromium_framework (-72KiB), for
programs that have more dynamic bindings, the difference can be quite
large.

This change is perf-neutral when linking chromium_framework.

Reviewed By: #lld-macho, thakis

Differential Revision: https://reviews.llvm.org/D105075

Added: 
    lld/test/MachO/bind-opcodes.s

Modified: 
    lld/MachO/SyntheticSections.cpp
    lld/MachO/SyntheticSections.h

Removed: 
    


################################################################################
diff  --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 2a409bfcfa112..0a8153f422462 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -273,7 +273,6 @@ struct Binding {
   OutputSegment *segment = nullptr;
   uint64_t offset = 0;
   int64_t addend = 0;
-  int16_t ordinal = 0;
 };
 } // namespace
 
@@ -283,9 +282,8 @@ struct Binding {
 // The bind opcode "interpreter" remembers the values of each binding field, so
 // we only need to encode the 
diff erences between bindings. Hence the use of
 // lastBinding.
-static void encodeBinding(const Symbol *sym, const OutputSection *osec,
-                          uint64_t outSecOff, int64_t addend,
-                          bool isWeakBinding, Binding &lastBinding,
+static void encodeBinding(const OutputSection *osec, uint64_t outSecOff,
+                          int64_t addend, Binding &lastBinding,
                           raw_svector_ostream &os) {
   OutputSegment *seg = osec->parent;
   uint64_t offset = osec->getSegmentOffset() + outSecOff;
@@ -307,13 +305,7 @@ static void encodeBinding(const Symbol *sym, const OutputSection *osec,
     lastBinding.addend = addend;
   }
 
-  uint8_t flags = BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM;
-  if (!isWeakBinding && sym->isWeakRef())
-    flags |= BIND_SYMBOL_FLAGS_WEAK_IMPORT;
-
-  os << flags << sym->getName() << '\0'
-     << static_cast<uint8_t>(BIND_OPCODE_SET_TYPE_IMM | BIND_TYPE_POINTER)
-     << static_cast<uint8_t>(BIND_OPCODE_DO_BIND);
+  os << static_cast<uint8_t>(BIND_OPCODE_DO_BIND);
   // DO_BIND causes dyld to both perform the binding and increment the offset
   lastBinding.offset += target->wordSize;
 }
@@ -345,6 +337,37 @@ static void encodeWeakOverride(const Defined *defined,
      << defined->getName() << '\0';
 }
 
+// Organize the bindings so we can encoded them with fewer opcodes.
+//
+// First, all bindings for a given symbol should be grouped together.
+// BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM is the largest opcode (since it
+// has an associated symbol string), so we only want to emit it once per symbol.
+//
+// Within each group, we sort the bindings by address. Since bindings are
+// delta-encoded, sorting them allows for a more compact result. Note that
+// sorting by address alone ensures that bindings for the same segment / section
+// are located together, minimizing the number of times we have to emit
+// BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB.
+//
+// Finally, we sort the symbols by the address of their first binding, again
+// to facilitate the delta-encoding process.
+template <class Sym>
+std::vector<std::pair<const Sym *, std::vector<BindingEntry>>>
+sortBindings(const BindingsMap<const Sym *> &bindingsMap) {
+  std::vector<std::pair<const Sym *, std::vector<BindingEntry>>> bindingsVec(
+      bindingsMap.begin(), bindingsMap.end());
+  for (auto &p : bindingsVec) {
+    std::vector<BindingEntry> &bindings = p.second;
+    llvm::sort(bindings, [](const BindingEntry &a, const BindingEntry &b) {
+      return a.target.getVA() < b.target.getVA();
+    });
+  }
+  llvm::sort(bindingsVec, [](const auto &a, const auto &b) {
+    return a.second[0].target.getVA() < b.second[0].target.getVA();
+  });
+  return bindingsVec;
+}
+
 // Emit bind opcodes, which are a stream of byte-sized opcodes that dyld
 // interprets to update a record with the following fields:
 //  * segment index (of the segment to write the symbol addresses to, typically
@@ -361,24 +384,30 @@ static void encodeWeakOverride(const Defined *defined,
 void BindingSection::finalizeContents() {
   raw_svector_ostream os{contents};
   Binding lastBinding;
-
-  // Since bindings are delta-encoded, sorting them allows for a more compact
-  // result. Note that sorting by address alone ensures that bindings for the
-  // same segment / section are located together.
-  llvm::sort(bindings, [](const BindingEntry &a, const BindingEntry &b) {
-    return a.target.getVA() < b.target.getVA();
-  });
-  for (const BindingEntry &b : bindings) {
-    int16_t ordinal = ordinalForDylibSymbol(*b.dysym);
-    if (ordinal != lastBinding.ordinal) {
+  int16_t lastOrdinal = 0;
+
+  for (auto &p : sortBindings(bindingsMap)) {
+    const DylibSymbol *sym = p.first;
+    std::vector<BindingEntry> &bindings = p.second;
+    llvm::sort(bindings, [](const BindingEntry &a, const BindingEntry &b) {
+      return a.target.getVA() < b.target.getVA();
+    });
+    uint8_t flags = BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM;
+    if (sym->isWeakRef())
+      flags |= BIND_SYMBOL_FLAGS_WEAK_IMPORT;
+    os << flags << sym->getName() << '\0'
+       << static_cast<uint8_t>(BIND_OPCODE_SET_TYPE_IMM | BIND_TYPE_POINTER);
+    int16_t ordinal = ordinalForDylibSymbol(*sym);
+    if (ordinal != lastOrdinal) {
       encodeDylibOrdinal(ordinal, os);
-      lastBinding.ordinal = ordinal;
+      lastOrdinal = ordinal;
     }
-    encodeBinding(b.dysym, b.target.isec->parent,
-                  b.target.isec->getOffset(b.target.offset), b.addend,
-                  /*isWeakBinding=*/false, lastBinding, os);
+    for (const BindingEntry &b : bindings)
+      encodeBinding(b.target.isec->parent,
+                    b.target.isec->getOffset(b.target.offset), b.addend,
+                    lastBinding, os);
   }
-  if (!bindings.empty())
+  if (!bindingsMap.empty())
     os << static_cast<uint8_t>(BIND_OPCODE_DONE);
 }
 
@@ -396,17 +425,18 @@ void WeakBindingSection::finalizeContents() {
   for (const Defined *defined : definitions)
     encodeWeakOverride(defined, os);
 
-  // Since bindings are delta-encoded, sorting them allows for a more compact
-  // result.
-  llvm::sort(bindings,
-             [](const WeakBindingEntry &a, const WeakBindingEntry &b) {
-               return a.target.getVA() < b.target.getVA();
-             });
-  for (const WeakBindingEntry &b : bindings)
-    encodeBinding(b.symbol, b.target.isec->parent,
-                  b.target.isec->getOffset(b.target.offset), b.addend,
-                  /*isWeakBinding=*/true, lastBinding, os);
-  if (!bindings.empty() || !definitions.empty())
+  for (auto &p : sortBindings(bindingsMap)) {
+    const Symbol *sym = p.first;
+    std::vector<BindingEntry> &bindings = p.second;
+    os << static_cast<uint8_t>(BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM)
+       << sym->getName() << '\0'
+       << static_cast<uint8_t>(BIND_OPCODE_SET_TYPE_IMM | BIND_TYPE_POINTER);
+    for (const BindingEntry &b : bindings)
+      encodeBinding(b.target.isec->parent,
+                    b.target.isec->getOffset(b.target.offset), b.addend,
+                    lastBinding, os);
+  }
+  if (!bindingsMap.empty() || !definitions.empty())
     os << static_cast<uint8_t>(BIND_OPCODE_DONE);
 }
 

diff  --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index b33d5e596c6b7..162e69e6fe245 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -16,6 +16,7 @@
 #include "OutputSegment.h"
 #include "Target.h"
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/MC/StringTableBuilder.h"
@@ -169,40 +170,34 @@ class RebaseSection final : public LinkEditSection {
 };
 
 struct BindingEntry {
-  const DylibSymbol *dysym;
   int64_t addend;
   Location target;
-  BindingEntry(const DylibSymbol *dysym, int64_t addend, Location target)
-      : dysym(dysym), addend(addend), target(std::move(target)) {}
+  BindingEntry(int64_t addend, Location target)
+      : addend(addend), target(std::move(target)) {}
 };
 
+template <class Sym>
+using BindingsMap = llvm::DenseMap<Sym, std::vector<BindingEntry>>;
+
 // Stores bind opcodes for telling dyld which symbols to load non-lazily.
 class BindingSection final : public LinkEditSection {
 public:
   BindingSection();
   void finalizeContents() override;
   uint64_t getRawSize() const override { return contents.size(); }
-  bool isNeeded() const override { return !bindings.empty(); }
+  bool isNeeded() const override { return !bindingsMap.empty(); }
   void writeTo(uint8_t *buf) const override;
 
   void addEntry(const DylibSymbol *dysym, const InputSection *isec,
                 uint64_t offset, int64_t addend = 0) {
-    bindings.emplace_back(dysym, addend, Location(isec, offset));
+    bindingsMap[dysym].emplace_back(addend, Location(isec, offset));
   }
 
 private:
-  std::vector<BindingEntry> bindings;
+  BindingsMap<const DylibSymbol *> bindingsMap;
   SmallVector<char, 128> contents;
 };
 
-struct WeakBindingEntry {
-  const Symbol *symbol;
-  int64_t addend;
-  Location target;
-  WeakBindingEntry(const Symbol *symbol, int64_t addend, Location target)
-      : symbol(symbol), addend(addend), target(std::move(target)) {}
-};
-
 // Stores bind opcodes for telling dyld which weak symbols need coalescing.
 // There are two types of entries in this section:
 //
@@ -219,17 +214,17 @@ class WeakBindingSection final : public LinkEditSection {
   void finalizeContents() override;
   uint64_t getRawSize() const override { return contents.size(); }
   bool isNeeded() const override {
-    return !bindings.empty() || !definitions.empty();
+    return !bindingsMap.empty() || !definitions.empty();
   }
 
   void writeTo(uint8_t *buf) const override;
 
   void addEntry(const Symbol *symbol, const InputSection *isec, uint64_t offset,
                 int64_t addend = 0) {
-    bindings.emplace_back(symbol, addend, Location(isec, offset));
+    bindingsMap[symbol].emplace_back(addend, Location(isec, offset));
   }
 
-  bool hasEntry() const { return !bindings.empty(); }
+  bool hasEntry() const { return !bindingsMap.empty(); }
 
   void addNonWeakDefinition(const Defined *defined) {
     definitions.emplace_back(defined);
@@ -238,7 +233,7 @@ class WeakBindingSection final : public LinkEditSection {
   bool hasNonWeakDefinition() const { return !definitions.empty(); }
 
 private:
-  std::vector<WeakBindingEntry> bindings;
+  BindingsMap<const Symbol *> bindingsMap;
   std::vector<const Defined *> definitions;
   SmallVector<char, 128> contents;
 };

diff  --git a/lld/test/MachO/bind-opcodes.s b/lld/test/MachO/bind-opcodes.s
new file mode 100644
index 0000000000000..b66bffbd8c41c
--- /dev/null
+++ b/lld/test/MachO/bind-opcodes.s
@@ -0,0 +1,45 @@
+# REQUIRES: x86
+# RUN: rm -rf %t; split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo.s -o %t/foo.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o
+# RUN: %lld -dylib %t/foo.o -o %t/libfoo.dylib
+# RUN: %lld -lSystem %t/test.o %t/libfoo.dylib -o %t/test
+
+## Make sure we emit exactly one BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM per
+## symbol.
+# RUN: obj2yaml %t/test | FileCheck %s --implicit-check-not BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+
+# CHECK:      Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+# CHECK-NEXT: Imm:             0
+# CHECK-NEXT: Symbol:          _foo
+
+# CHECK:      Opcode:          BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
+# CHECK-NEXT: Imm:             0
+# CHECK-NEXT: Symbol:          _bar
+
+# RUN: llvm-objdump --macho --bind %t/test | FileCheck %s --check-prefix=BIND
+# BIND:       Bind table:
+# BIND-NEXT:  segment  section address type    addend dylib   symbol
+# BIND-NEXT:  __DATA   __data  {{.*}}  pointer      0 libfoo  _foo
+# BIND-NEXT:  __DATA   __data  {{.*}}  pointer      0 libfoo  _foo
+# BIND-NEXT:  __DATA   __data  {{.*}}  pointer      0 libfoo  _bar
+# BIND-NEXT:  __DATA   __data  {{.*}}  pointer      0 libfoo  _bar
+# BIND-EMPTY:
+
+#--- foo.s
+.globl _foo, _bar
+_foo:
+  .space 4
+_bar:
+  .space 4
+
+#--- test.s
+.data
+.quad _foo
+.quad _bar
+.quad _foo
+.quad _bar
+
+.globl _main
+.text
+_main:


        


More information about the llvm-commits mailing list