[lld] 9854edd - [lld-macho] Implement basic export trie

Shoaib Meenai via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 29 15:52:30 PDT 2020


Author: Jez Ng
Date: 2020-04-29T15:44:27-07:00
New Revision: 9854edd817c9d48966a40de061a385b37f9a352d

URL: https://github.com/llvm/llvm-project/commit/9854edd817c9d48966a40de061a385b37f9a352d
DIFF: https://github.com/llvm/llvm-project/commit/9854edd817c9d48966a40de061a385b37f9a352d.diff

LOG: [lld-macho] Implement basic export trie

Build the trie by performing a three-way radix quicksort: We start by
sorting the strings by their first characters, then sort the strings
with the same first characters by their second characters, and so on
recursively. Each time the prefixes diverge, we add a node to the trie.
Thanks to @ruiu for the idea.

I used llvm-mc's radix quicksort implementation as a starting point. The
trie offset fixpoint code was taken from
MachONormalizedFileBinaryWriter.cpp.

Differential Revision: https://reviews.llvm.org/D76977

Added: 
    lld/MachO/ExportTrie.cpp
    lld/MachO/ExportTrie.h
    lld/test/MachO/export-trie.s
    lld/test/MachO/no-exports-dylib.s

Modified: 
    lld/MachO/CMakeLists.txt
    lld/MachO/SyntheticSections.cpp
    lld/MachO/SyntheticSections.h
    lld/test/MachO/Inputs/libhello.s
    lld/test/MachO/dylink.s
    lld/test/MachO/symtab.s

Removed: 
    


################################################################################
diff  --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
index 0ed841fb15e0..f999c87f93ff 100644
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -5,6 +5,7 @@ add_public_tablegen_target(MachOOptionsTableGen)
 add_lld_library(lldMachO2
   Arch/X86_64.cpp
   Driver.cpp
+  ExportTrie.cpp
   InputFiles.cpp
   InputSection.cpp
   OutputSegment.cpp

diff  --git a/lld/MachO/ExportTrie.cpp b/lld/MachO/ExportTrie.cpp
new file mode 100644
index 000000000000..cb3c52e28381
--- /dev/null
+++ b/lld/MachO/ExportTrie.cpp
@@ -0,0 +1,236 @@
+//===- ExportTrie.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a partial implementation of the Mach-O export trie format. It's
+// essentially a symbol table encoded as a compressed prefix trie, meaning that
+// the common prefixes of each symbol name are shared for a more compact
+// representation. The prefixes are stored on the edges of the trie, and one
+// edge can represent multiple characters. For example, given two exported
+// symbols _bar and _baz, we will have a trie like this (terminal nodes are
+// marked with an asterisk):
+//
+//              +-+-+
+//              |   | // root node
+//              +-+-+
+//                |
+//                | _ba
+//                |
+//              +-+-+
+//              |   |
+//              +-+-+
+//           r /     \ z
+//            /       \
+//        +-+-+       +-+-+
+//        | * |       | * |
+//        +-+-+       +-+-+
+//
+// More documentation of the format can be found in
+// llvm/tools/obj2yaml/macho2yaml.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ExportTrie.h"
+#include "Symbols.h"
+
+#include "lld/Common/Memory.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+using namespace lld;
+using namespace lld::macho;
+
+namespace {
+
+struct Edge {
+  Edge(StringRef s, TrieNode *node) : substring(s), child(node) {}
+
+  StringRef substring;
+  struct TrieNode *child;
+};
+
+struct ExportInfo {
+  uint64_t address;
+  // TODO: Add proper support for re-exports & stub-and-resolver flags.
+};
+
+} // namespace
+
+namespace lld {
+namespace macho {
+
+struct TrieNode {
+  std::vector<Edge> edges;
+  Optional<ExportInfo> info;
+  // Estimated offset from the start of the serialized trie to the current node.
+  // This will converge to the true offset when updateOffset() is run to a
+  // fixpoint.
+  size_t offset = 0;
+
+  // Returns whether the new estimated offset 
diff ers from the old one.
+  bool updateOffset(size_t &nextOffset);
+  void writeTo(uint8_t *buf);
+};
+
+bool TrieNode::updateOffset(size_t &nextOffset) {
+  // Size of the whole node (including the terminalSize and the outgoing edges.)
+  // In contrast, terminalSize only records the size of the other data in the
+  // node.
+  size_t nodeSize;
+  if (info) {
+    uint64_t flags = 0;
+    uint32_t terminalSize =
+        getULEB128Size(flags) + getULEB128Size(info->address);
+    // Overall node size so far is the uleb128 size of the length of the symbol
+    // info + the symbol info itself.
+    nodeSize = terminalSize + getULEB128Size(terminalSize);
+  } else {
+    nodeSize = 1; // Size of terminalSize (which has a value of 0)
+  }
+  // Compute size of all child edges.
+  ++nodeSize; // Byte for number of children.
+  for (Edge &edge : edges) {
+    nodeSize += edge.substring.size() + 1             // String length.
+                + getULEB128Size(edge.child->offset); // Offset len.
+  }
+  // On input, 'nextOffset' is the new preferred location for this node.
+  bool result = (offset != nextOffset);
+  // Store new location in node object for use by parents.
+  offset = nextOffset;
+  nextOffset += nodeSize;
+  return result;
+}
+
+void TrieNode::writeTo(uint8_t *buf) {
+  buf += offset;
+  if (info) {
+    // TrieNodes with Symbol info: size, flags address
+    uint64_t flags = 0; // TODO: emit proper flags
+    uint32_t terminalSize =
+        getULEB128Size(flags) + getULEB128Size(info->address);
+    buf += encodeULEB128(terminalSize, buf);
+    buf += encodeULEB128(flags, buf);
+    buf += encodeULEB128(info->address, buf);
+  } else {
+    // TrieNode with no Symbol info.
+    *buf++ = 0; // terminalSize
+  }
+  // Add number of children. TODO: Handle case where we have more than 256.
+  assert(edges.size() < 256);
+  *buf++ = edges.size();
+  // Append each child edge substring and node offset.
+  for (const Edge &edge : edges) {
+    memcpy(buf, edge.substring.data(), edge.substring.size());
+    buf += edge.substring.size();
+    *buf++ = '\0';
+    buf += encodeULEB128(edge.child->offset, buf);
+  }
+}
+
+TrieNode *TrieBuilder::makeNode() {
+  auto *node = make<TrieNode>();
+  nodes.emplace_back(node);
+  return node;
+}
+
+static int charAt(const Symbol *sym, size_t pos) {
+  StringRef str = sym->getName();
+  if (pos >= str.size())
+    return -1;
+  return str[pos];
+}
+
+// Build the trie by performing a three-way radix quicksort: We start by sorting
+// the strings by their first characters, then sort the strings with the same
+// first characters by their second characters, and so on recursively. Each
+// time the prefixes diverge, we add a node to the trie.
+//
+// node:    The most recently created node along this path in the trie (i.e.
+//          the furthest from the root.)
+// lastPos: The prefix length of the most recently created node, i.e. the number
+//          of characters along its path from the root.
+// pos:     The string index we are currently sorting on. Note that each symbol
+//          S contained in vec has the same prefix S[0...pos).
+void TrieBuilder::sortAndBuild(MutableArrayRef<const Symbol *> vec,
+                               TrieNode *node, size_t lastPos, size_t pos) {
+tailcall:
+  if (vec.empty())
+    return;
+
+  // Partition items so that items in [0, i) are less than the pivot,
+  // [i, j) are the same as the pivot, and [j, vec.size()) are greater than
+  // the pivot.
+  const Symbol *pivotSymbol = vec[vec.size() / 2];
+  int pivot = charAt(pivotSymbol, pos);
+  size_t i = 0;
+  size_t j = vec.size();
+  for (size_t k = 0; k < j;) {
+    int c = charAt(vec[k], pos);
+    if (c < pivot)
+      std::swap(vec[i++], vec[k++]);
+    else if (c > pivot)
+      std::swap(vec[--j], vec[k]);
+    else
+      k++;
+  }
+
+  bool isTerminal = pivot == -1;
+  bool prefixesDiverge = i != 0 || j != vec.size();
+  if (lastPos != pos && (isTerminal || prefixesDiverge)) {
+    TrieNode *newNode = makeNode();
+    node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos),
+                             newNode);
+    node = newNode;
+    lastPos = pos;
+  }
+
+  sortAndBuild(vec.slice(0, i), node, lastPos, pos);
+  sortAndBuild(vec.slice(j), node, lastPos, pos);
+
+  if (isTerminal) {
+    assert(j - i == 1); // no duplicate symbols
+    node->info = {pivotSymbol->getVA() + ImageBase};
+  } else {
+    // This is the tail-call-optimized version of the following:
+    // sortAndBuild(vec.slice(i, j - i), node, lastPos, pos + 1);
+    vec = vec.slice(i, j - i);
+    ++pos;
+    goto tailcall;
+  }
+}
+
+size_t TrieBuilder::build() {
+  if (exported.empty())
+    return 0;
+
+  TrieNode *root = makeNode();
+  sortAndBuild(exported, root, 0, 0);
+
+  // Assign each node in the vector an offset in the trie stream, iterating
+  // until all uleb128 sizes have stabilized.
+  size_t offset;
+  bool more;
+  do {
+    offset = 0;
+    more = false;
+    for (TrieNode *node : nodes)
+      more |= node->updateOffset(offset);
+  } while (more);
+
+  return offset;
+}
+
+void TrieBuilder::writeTo(uint8_t *buf) {
+  for (TrieNode *node : nodes)
+    node->writeTo(buf);
+}
+
+} // namespace macho
+} // namespace lld

diff  --git a/lld/MachO/ExportTrie.h b/lld/MachO/ExportTrie.h
new file mode 100644
index 000000000000..22cf06d12893
--- /dev/null
+++ b/lld/MachO/ExportTrie.h
@@ -0,0 +1,41 @@
+//===- ExportTrie.h ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_MACHO_EXPORT_TRIE_H
+#define LLD_MACHO_EXPORT_TRIE_H
+
+#include "llvm/ADT/ArrayRef.h"
+
+#include <vector>
+
+namespace lld {
+namespace macho {
+
+struct TrieNode;
+class Symbol;
+
+class TrieBuilder {
+public:
+  void addSymbol(const Symbol &sym) { exported.push_back(&sym); }
+  // Returns the size in bytes of the serialized trie.
+  size_t build();
+  void writeTo(uint8_t *buf);
+
+private:
+  TrieNode *makeNode();
+  void sortAndBuild(llvm::MutableArrayRef<const Symbol *> vec, TrieNode *node,
+                    size_t lastPos, size_t pos);
+
+  std::vector<const Symbol *> exported;
+  std::vector<TrieNode *> nodes;
+};
+
+} // namespace macho
+} // namespace lld
+
+#endif

diff  --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 8fce138a20fd..a6d22ed82e1a 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -8,6 +8,7 @@
 
 #include "SyntheticSections.h"
 #include "Config.h"
+#include "ExportTrie.h"
 #include "InputFiles.h"
 #include "OutputSegment.h"
 #include "SymbolTable.h"
@@ -136,38 +137,14 @@ ExportSection::ExportSection() {
 }
 
 void ExportSection::finalizeContents() {
-  raw_svector_ostream os{contents};
-  std::vector<const Defined *> exported;
   // TODO: We should check symbol visibility.
   for (const Symbol *sym : symtab->getSymbols())
     if (auto *defined = dyn_cast<Defined>(sym))
-      exported.push_back(defined);
-
-  if (exported.empty())
-    return;
-
-  if (exported.size() > 1) {
-    error("TODO: Unable to export more than 1 symbol");
-    return;
-  }
-
-  const Defined *sym = exported.front();
-  os << (char)0; // Indicates non-leaf node
-  os << (char)1; // # of children
-  os << sym->getName() << '\0';
-  encodeULEB128(sym->getName().size() + 4, os); // Leaf offset
-
-  // Leaf node
-  uint64_t addr = sym->getVA() + ImageBase;
-  os << (char)(1 + getULEB128Size(addr));
-  os << (char)0; // Flags
-  encodeULEB128(addr, os);
-  os << (char)0; // Terminator
+      trieBuilder.addSymbol(*defined);
+  size = trieBuilder.build();
 }
 
-void ExportSection::writeTo(uint8_t *buf) {
-  memcpy(buf, contents.data(), contents.size());
-}
+void ExportSection::writeTo(uint8_t *buf) { trieBuilder.writeTo(buf); }
 
 SymtabSection::SymtabSection(StringTableSection &stringTableSection)
     : stringTableSection(stringTableSection) {

diff  --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index 1d59312ac14e..df53200e5c28 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -9,6 +9,7 @@
 #ifndef LLD_MACHO_SYNTHETIC_SECTIONS_H
 #define LLD_MACHO_SYNTHETIC_SECTIONS_H
 
+#include "ExportTrie.h"
 #include "InputSection.h"
 #include "Target.h"
 #include "llvm/ADT/SetVector.h"
@@ -101,14 +102,16 @@ class ExportSection : public InputSection {
 public:
   ExportSection();
   void finalizeContents();
-  size_t getSize() const override { return contents.size(); }
+  size_t getSize() const override { return size; }
   // Like other sections in __LINKEDIT, the export section is special: its
   // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in
   // section headers.
   bool isHidden() const override { return true; }
   void writeTo(uint8_t *buf) override;
 
-  SmallVector<char, 128> contents;
+private:
+  TrieBuilder trieBuilder;
+  size_t size = 0;
 };
 
 // Stores the strings referenced by the symbol table.

diff  --git a/lld/test/MachO/Inputs/libhello.s b/lld/test/MachO/Inputs/libhello.s
index 4b09ab430418..9fc880c85cb5 100644
--- a/lld/test/MachO/Inputs/libhello.s
+++ b/lld/test/MachO/Inputs/libhello.s
@@ -1,5 +1,8 @@
 .section __TEXT,__cstring
-.globl _hello_world
+.globl _hello_world, _hello_its_me
 
 _hello_world:
 .asciz "Hello world!\n"
+
+_hello_its_me:
+.asciz "Hello, it's me\n"

diff  --git a/lld/test/MachO/dylink.s b/lld/test/MachO/dylink.s
index 18265aa0ffe9..26302013035d 100644
--- a/lld/test/MachO/dylink.s
+++ b/lld/test/MachO/dylink.s
@@ -15,12 +15,16 @@
 # CHECK: movq [[#%u, HELLO_OFF:]](%rip), %rsi
 # CHECK-NEXT: [[#%x, HELLO_RIP:]]:
 
+# CHECK: movq [[#%u, HELLO_ITS_ME_OFF:]](%rip), %rsi
+# CHECK-NEXT: [[#%x, HELLO_ITS_ME_RIP:]]:
+
 # CHECK: movq [[#%u, GOODBYE_OFF:]](%rip), %rsi
 # CHECK-NEXT: [[#%x, GOODBYE_RIP:]]:
 
 # CHECK-LABEL: Bind table:
-# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, HELLO_RIP + HELLO_OFF]]     pointer 0 libhello   _hello_world
-# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, GOODBYE_RIP + GOODBYE_OFF]] pointer 0 libgoodbye _goodbye_world
+# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, HELLO_RIP + HELLO_OFF]]               pointer 0 libhello   _hello_world
+# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, HELLO_ITS_ME_RIP + HELLO_ITS_ME_OFF]] pointer 0 libhello   _hello_its_me
+# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, GOODBYE_RIP + GOODBYE_OFF]]           pointer 0 libgoodbye _goodbye_world
 
 .section __TEXT,__text
 .globl _main
@@ -32,6 +36,12 @@ _main:
   mov $13, %rdx # length of str
   syscall
 
+  movl $0x2000004, %eax # write() syscall
+  mov $1, %rdi # stdout
+  movq _hello_its_me at GOTPCREL(%rip), %rsi
+  mov $15, %rdx # length of str
+  syscall
+
   movl $0x2000004, %eax # write() syscall
   mov $1, %rdi # stdout
   movq _goodbye_world at GOTPCREL(%rip), %rsi

diff  --git a/lld/test/MachO/export-trie.s b/lld/test/MachO/export-trie.s
new file mode 100644
index 000000000000..fc1f6d110dba
--- /dev/null
+++ b/lld/test/MachO/export-trie.s
@@ -0,0 +1,44 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o
+# RUN: lld -flavor darwinnew -dylib %t.o -o %t.dylib
+
+# RUN: llvm-objdump --syms --exports-trie %t.dylib | \
+# RUN:   FileCheck %s --check-prefix=EXPORTS
+# EXPORTS-LABEL: SYMBOL TABLE:
+# EXPORTS-DAG:   [[#%x, HELLO_ADDR:]] {{.*}} _hello
+# EXPORTS-DAG:   [[#%x, HELLO_WORLD_ADDR:]] {{.*}} _hello_world
+# EXPORTS-DAG:   [[#%x, HELLO_ITS_ME_ADDR:]] {{.*}} _hello_its_me
+# EXPORTS-DAG:   [[#%x, HELLO_ITS_YOU_ADDR:]] {{.*}} _hello_its_you
+# EXPORTS-LABEL: Exports trie:
+# EXPORTS-DAG:   0x{{0*}}[[#%X, HELLO_ADDR]] _hello
+# EXPORTS-DAG:   0x{{0*}}[[#%X, HELLO_WORLD_ADDR]] _hello_world
+# EXPORTS-DAG:   0x{{0*}}[[#%x, HELLO_ITS_ME_ADDR:]] _hello_its_me
+# EXPORTS-DAG:   0x{{0*}}[[#%x, HELLO_ITS_YOU_ADDR:]] _hello_its_you
+
+## Check that we are sharing prefixes in the trie.
+# RUN: obj2yaml %t.dylib | FileCheck %s
+# CHECK-LABEL: ExportTrie:
+# CHECK: Name: ''
+# CHECK: Name: _hello
+# CHECK: Name: _
+# CHECK: Name: world
+# CHECK: Name: its_
+# CHECK: Name: me
+# CHECK: Name: you
+
+.section __TEXT,__cstring
+.globl _hello, _hello_world, _hello_its_me, _hello_its_you
+
+## Test for when an entire symbol name is a prefix of another.
+_hello:
+.asciz "Hello!\n"
+
+_hello_world:
+.asciz "Hello world!\n"
+
+.data
+_hello_its_me:
+.asciz "Hello, it's me\n"
+
+_hello_its_you:
+.asciz "Hello, it's you\n"

diff  --git a/lld/test/MachO/no-exports-dylib.s b/lld/test/MachO/no-exports-dylib.s
new file mode 100644
index 000000000000..896c31ef3c2f
--- /dev/null
+++ b/lld/test/MachO/no-exports-dylib.s
@@ -0,0 +1,6 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o
+# RUN: lld -flavor darwinnew -dylib %t.o -o %t.dylib
+
+# RUN: obj2yaml %t.dylib | FileCheck %s
+# CHECK: export_size: 0

diff  --git a/lld/test/MachO/symtab.s b/lld/test/MachO/symtab.s
index a647e4a5bb39..44a016912bd6 100644
--- a/lld/test/MachO/symtab.s
+++ b/lld/test/MachO/symtab.s
@@ -14,10 +14,41 @@
 # CHECK-NEXT:     ]
 # CHECK-NEXT:     Value:
 # CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: bar
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __text (0x1)
+# CHECK-NEXT:     RefType:
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value:
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: foo
+# CHECK-NEXT:     Extern
+# CHECK-NEXT:     Type: Section (0xE)
+# CHECK-NEXT:     Section: __data
+# CHECK-NEXT:     RefType:
+# CHECK-NEXT:     Flags [ (0x0)
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     Value:
+# CHECK-NEXT:   }
 # CHECK-NEXT: ]
 
+.data
+.global foo
+foo:
+  .asciz "Hello world!\n"
+
+.text
+.global bar
 .global _main
 
 _main:
   mov $0, %rax
   ret
+
+bar:
+  mov $2, %rax
+  ret


        


More information about the llvm-commits mailing list