[lld] 213dbdb - [lld-macho] Overhaul map file code

Jez Ng via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 8 13:34:19 PST 2022


Author: Jez Ng
Date: 2022-11-08T16:33:22-05:00
New Revision: 213dbdbef0bad835abca0753f9e59b17dc2bcde2

URL: https://github.com/llvm/llvm-project/commit/213dbdbef0bad835abca0753f9e59b17dc2bcde2
DIFF: https://github.com/llvm/llvm-project/commit/213dbdbef0bad835abca0753f9e59b17dc2bcde2.diff

LOG: [lld-macho] Overhaul map file code

The previous map file code was modeled after LLD-ELF's
implementation. However, ld64's map file differs quite a bit from
LLD-ELF's. I've revamped our map file implementation so it is better
able to emit ld64-style map files.

Notable differences:
* ld64 doesn't demangle symbols in map files, regardless of whether
  `-demangle` is passed. So we don't have to bother with
  `getSymbolStrings()`.
* ld64 doesn't emit symbols in cstring sections; it emits just the
  literal values. Moreover, it emits these literal values regardless of
  whether they are labeled with a symbol.
* ld64 emits map file entries for things that are not strictly symbols,
  such as unwind info, GOT entries, etc. That isn't handled in this
  diff, but this redesign makes them easy to implement.

Additionally, the previous implementation sorted the symbols so as to
emit them in address order. This was slow and unnecessary -- the symbols
can already be traversed in address order by walking the list of
OutputSections. This brings significant speedups. Here are the numbers
from the chromium_framework_less_dwarf benchmark on my Mac Pro, with the
`-map` argument added to the response file:

             base            diff           difference (95% CI)
  sys_time   2.922 ± 0.059   2.950 ± 0.085  [  -0.7% ..   +2.5%]
  user_time  11.464 ± 0.191  8.290 ± 0.123  [ -28.7% ..  -26.7%]
  wall_time  11.235 ± 0.175  9.184 ± 0.169  [ -19.3% ..  -17.2%]
  samples    16              23

(It's worth noting that map files are written in parallel with the
output binary, but they often took longer to write than the binary
itself.)

Finally, I did further cleanups to the map-file.s test -- there was no
real need to have a custom-named section. There were also alt_entry
symbol declarations that had no corresponding definition. Either way,
neither custom-named sections nor alt_entry symbols trigger special code
paths in our map file implementation.

Reviewed By: #lld-macho, Roger

Differential Revision: https://reviews.llvm.org/D137368

Added: 
    

Modified: 
    lld/MachO/MapFile.cpp
    lld/test/MachO/map-file.s

Removed: 
    


################################################################################
diff  --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp
index 8f1b6a13330a7..5d6c87baba9f1 100644
--- a/lld/MachO/MapFile.cpp
+++ b/lld/MachO/MapFile.cpp
@@ -6,9 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the -map option. It shows lists in order and
-// hierarchically the outputFile, arch, input files, output sections and
-// symbols:
+// This file implements the -map option, which maps address ranges to their
+// respective contents, plus the input file these contents were originally from.
+// The contents (typically symbols) are listed in address order. Dead-stripped
+// contents are included as well.
 //
 // # Path: test
 // # Arch: x86_84
@@ -28,15 +29,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "MapFile.h"
+#include "ConcatOutputSection.h"
 #include "Config.h"
 #include "InputFiles.h"
 #include "InputSection.h"
-#include "OutputSection.h"
 #include "OutputSegment.h"
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
 #include "lld/Common/ErrorHandler.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Parallel.h"
 #include "llvm/Support/TimeProfiler.h"
 
@@ -45,69 +47,75 @@ using namespace llvm::sys;
 using namespace lld;
 using namespace lld::macho;
 
+struct CStringInfo {
+  uint32_t fileIndex;
+  StringRef str;
+};
+
 struct MapInfo {
   SmallVector<InputFile *> files;
-  SmallVector<Defined *> liveSymbols;
   SmallVector<Defined *> deadSymbols;
+  DenseMap<const OutputSection *,
+           SmallVector<std::pair<uint64_t /*addr*/, CStringInfo>>>
+      liveCStringsForSection;
+  SmallVector<CStringInfo> deadCStrings;
 };
 
 static MapInfo gatherMapInfo() {
   MapInfo info;
   for (InputFile *file : inputFiles)
     if (isa<ObjFile>(file) || isa<BitcodeFile>(file)) {
-      bool hasEmittedSymbol = false;
+      uint32_t fileIndex = info.files.size() + 1;
+      bool isReferencedFile = false;
+
+      // Gather the dead symbols. We don't have to bother with the live ones
+      // because we will pick them up as we iterate over the OutputSections
+      // later.
       for (Symbol *sym : file->symbols) {
         if (auto *d = dyn_cast_or_null<Defined>(sym))
-          if (d->isec && d->getFile() == file) {
-            if (d->isLive()) {
-              assert(!shouldOmitFromOutput(d->isec));
-              info.liveSymbols.push_back(d);
-            } else {
+          // Only emit the prevailing definition of a symbol. Also, don't emit
+          // the symbol if it is part of a cstring section (we use the literal
+          // value instead, similar to ld64)
+          if (d->isec && d->getFile() == file &&
+              !isa<CStringInputSection>(d->isec)) {
+            isReferencedFile = true;
+            if (!d->isLive())
               info.deadSymbols.push_back(d);
+          }
+      }
+
+      // Gather all the cstrings (both live and dead). A CString(Output)Section
+      // doesn't provide us a way of figuring out which InputSections its
+      // cstring contents came from, so we need to build up that mapping here.
+      for (const Section *sec : file->sections) {
+        for (const Subsection &subsec : sec->subsections) {
+          if (auto isec = dyn_cast<CStringInputSection>(subsec.isec)) {
+            auto &liveCStrings = info.liveCStringsForSection[isec->parent];
+            for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
+              if (piece.live)
+                liveCStrings.push_back({isec->parent->addr + piece.outSecOff,
+                                        {fileIndex, isec->getStringRef(i)}});
+              else
+                info.deadCStrings.push_back({fileIndex, isec->getStringRef(i)});
+              isReferencedFile = true;
             }
-            hasEmittedSymbol = true;
+          } else {
+            break;
           }
+        }
       }
-      if (hasEmittedSymbol)
-        info.files.push_back(file);
-    }
-  parallelSort(info.liveSymbols.begin(), info.liveSymbols.end(),
-               [](Defined *a, Defined *b) { return a->getVA() < b->getVA(); });
-  return info;
-}
 
-// Construct a map from symbols to their stringified representations.
-// Demangling symbols (which is what toString() does) is slow, so
-// we do that in batch using parallel-for.
-static DenseMap<Symbol *, std::string>
-getSymbolStrings(ArrayRef<Defined *> syms) {
-  std::vector<std::string> str(syms.size());
-  parallelFor(0, syms.size(), [&](size_t i) {
-    raw_string_ostream os(str[i]);
-    Defined *sym = syms[i];
-
-    switch (sym->isec->kind()) {
-    case InputSection::CStringLiteralKind: {
-      // Output "literal string: <string literal>"
-      const auto *isec = cast<CStringInputSection>(sym->isec);
-      const StringPiece &piece = isec->getStringPiece(sym->value);
-      assert(
-          sym->value == piece.inSecOff &&
-          "We expect symbols to always point to the start of a StringPiece.");
-      StringRef str = isec->getStringRef(&piece - &(*isec->pieces.begin()));
-      (os << "literal string: ").write_escaped(str);
-      break;
-    }
-    case InputSection::ConcatKind:
-    case InputSection::WordLiteralKind:
-      os << toString(*sym);
+      if (isReferencedFile)
+        info.files.push_back(file);
     }
-  });
 
-  DenseMap<Symbol *, std::string> ret;
-  for (size_t i = 0, e = syms.size(); i < e; ++i)
-    ret[syms[i]] = std::move(str[i]);
-  return ret;
+  // cstrings are not stored in sorted order in their OutputSections, so we sort
+  // them here.
+  for (auto &liveCStrings : info.liveCStringsForSection)
+    parallelSort(liveCStrings.second, [](const auto &p1, const auto &p2) {
+      return p1.first < p2.first;
+    });
+  return info;
 }
 
 void macho::writeMapFile() {
@@ -124,16 +132,12 @@ void macho::writeMapFile() {
     return;
   }
 
-  // Dump output path.
   os << format("# Path: %s\n", config->outputFile.str().c_str());
-
-  // Dump output architecture.
   os << format("# Arch: %s\n",
                getArchitectureName(config->arch()).str().c_str());
 
   MapInfo info = gatherMapInfo();
 
-  // Dump table of object files.
   os << "# Object files:\n";
   os << format("[%3u] %s\n", 0, (const char *)"linker synthesized");
   uint32_t fileIndex = 1;
@@ -143,7 +147,6 @@ void macho::writeMapFile() {
     readerToFileOrdinal[file] = fileIndex++;
   }
 
-  // Dump table of sections
   os << "# Sections:\n";
   os << "# Address\tSize    \tSegment\tSection\n";
   for (OutputSegment *seg : outputSegments)
@@ -155,28 +158,48 @@ void macho::writeMapFile() {
                    seg->name.str().c_str(), osec->name.str().c_str());
     }
 
-  // Dump table of symbols
-  DenseMap<Symbol *, std::string> liveSymbolStrings =
-      getSymbolStrings(info.liveSymbols);
   os << "# Symbols:\n";
   os << "# Address\tSize    \tFile  Name\n";
-  for (Defined *sym : info.liveSymbols) {
-    assert(sym->isLive());
-    os << format("0x%08llX\t0x%08llX\t[%3u] %s\n", sym->getVA(), sym->size,
-                 readerToFileOrdinal[sym->getFile()],
-                 liveSymbolStrings[sym].c_str());
+  for (const OutputSegment *seg : outputSegments) {
+    for (const OutputSection *osec : seg->getSections()) {
+      if (auto *concatOsec = dyn_cast<ConcatOutputSection>(osec)) {
+        for (const InputSection *isec : concatOsec->inputs) {
+          for (Defined *sym : isec->symbols)
+            os << format("0x%08llX\t0x%08llX\t[%3u] %s\n", sym->getVA(),
+                         sym->size, readerToFileOrdinal[sym->getFile()],
+                         sym->getName().str().data());
+        }
+      } else if (osec == in.cStringSection || osec == in.objcMethnameSection) {
+        const auto &liveCStrings = info.liveCStringsForSection.lookup(osec);
+        uint64_t lastAddr = 0; // strings will never start at address 0, so this
+                               // is a sentinel value
+        for (const auto &[addr, info] : liveCStrings) {
+          uint64_t size = 0;
+          if (addr != lastAddr)
+            size = info.str.size() + 1; // include null terminator
+          lastAddr = addr;
+          os << format("0x%08llX\t0x%08llX\t[%3u] literal string: ", addr, size,
+                       info.fileIndex);
+          os.write_escaped(info.str) << "\n";
+        }
+      }
+      // TODO print other synthetic sections
+    }
   }
 
   if (config->deadStrip) {
-    DenseMap<Symbol *, std::string> deadSymbolStrings =
-        getSymbolStrings(info.deadSymbols);
     os << "# Dead Stripped Symbols:\n";
     os << "#        \tSize    \tFile  Name\n";
     for (Defined *sym : info.deadSymbols) {
       assert(!sym->isLive());
       os << format("<<dead>>\t0x%08llX\t[%3u] %s\n", sym->size,
                    readerToFileOrdinal[sym->getFile()],
-                   deadSymbolStrings[sym].c_str());
+                   sym->getName().str().data());
+    }
+    for (CStringInfo &cstrInfo : info.deadCStrings) {
+      os << format("<<dead>>\t0x%08llX\t[%3u] literal string: ",
+                   cstrInfo.str.size() + 1, cstrInfo.fileIndex);
+      os.write_escaped(cstrInfo.str) << "\n";
     }
   }
 }

diff  --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
index ac5ae9d02074c..fe1ef88604e1e 100644
--- a/lld/test/MachO/map-file.s
+++ b/lld/test/MachO/map-file.s
@@ -4,23 +4,24 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/c-string-literal.s -o %t/c-string-literal.o
 
-# RUN: %lld -map %t/map %t/test.o %t/foo.o %t/c-string-literal.o --time-trace -o %t/test
+# RUN: %lld -demangle -map %t/map %t/test.o %t/foo.o %t/c-string-literal.o \
+# RUN:   --time-trace -o %t/test
 # RUN: llvm-objdump --syms --section-headers %t/test > %t/objdump
-# RUN: cat %t/objdump %t/map > %t/out
-# RUN: FileCheck %s < %t/out
+## Check that symbols in cstring sections aren't emitted
+# RUN: cat %t/objdump %t/map | FileCheck %s --implicit-check-not _hello_world
 # RUN: FileCheck %s --check-prefix=MAPFILE < %t/test.time-trace
 
 # CHECK:      Sections:
-# CHECK-NEXT: Idx  Name          Size           VMA           Type
-# CHECK-NEXT: 0    __text        {{[0-9a-f]+}}  [[#%x,TEXT:]] TEXT
-# CHECK-NEXT: 1    obj           {{[0-9a-f]+}}  [[#%x,DATA:]] TEXT
-# CHECK-NEXT: 2    __cstring     {{[0-9a-f]+}}  [[#%x,CSTR:]] DATA
-# CHECK-NEXT: 3    __common      {{[0-9a-f]+}}  [[#%x,BSS:]]  BSS
+# CHECK-NEXT: Idx  Name          Size           VMA               Type
+# CHECK-NEXT: 0    __text        {{[0-9a-f]+}}  [[#%x,TEXT:]]     TEXT
+# CHECK-NEXT: 1    __cstring     {{[0-9a-f]+}}  [[#%x,CSTR:]]     DATA
+# CHECK-NEXT: 2    __common      {{[0-9a-f]+}}  [[#%x,BSS:]]      BSS
 
 # CHECK:      SYMBOL TABLE:
 # CHECK-DAG:  [[#%x,MAIN:]]    g     F __TEXT,__text _main
 # CHECK-DAG:  [[#%x,NUMBER:]]  g     O __DATA,__common _number
-# CHECK-DAG:  [[#%x,FOO:]]     g     F __TEXT,obj _foo
+# CHECK-DAG:  [[#%x,BAR:]]     g     F __TEXT,__text _bar
+# CHECK-DAG:  [[#%x,FOO:]]     g     F __TEXT,__text __ZTIN3foo3bar4MethE
 # CHECK-DAG:  [[#%x,HIWORLD:]] g     O __TEXT,__cstring _hello_world
 # CHECK-DAG:  [[#%x,HIITSME:]] g     O __TEXT,__cstring _hello_its_me
 
@@ -35,43 +36,50 @@
 # CHECK-NEXT: # Sections:
 # CHECK-NEXT: # Address       Size              Segment  Section
 # CHECK-NEXT: 0x[[#%X,TEXT]]  0x{{[0-9A-F]+}}   __TEXT   __text
-# CHECK-NEXT: 0x[[#%X,DATA]]  0x{{[0-9A-F]+}}   __TEXT   obj
 # CHECK-NEXT: 0x[[#%X,CSTR]]  0x{{[0-9A-F]+}}   __TEXT   __cstring
 # CHECK-NEXT: 0x[[#%X,BSS]]   0x{{[0-9A-F]+}}   __DATA   __common
 
 # CHECK-NEXT: # Symbols:
-# CHECK-NEXT: # Address           Size        File   Name
-# CHECK-DAG:  0x[[#%X,MAIN]]      0x00000001  [  1]  _main
-# CHECK-DAG:  0x[[#%X,FOO]]       0x00000001  [  2]  _foo
-# CHECK-DAG:  0x[[#%X,HIWORLD]]   0x0000000E  [  3]  literal string: Hello world!\n
-# CHECK-DAG:  0x[[#%X,HIITSME]]   0x0000000F  [  3]  literal string: Hello, it's me
-# CHECK-DAG:  0x[[#%X,NUMBER]]    0x00000001  [  1]  _number
+# CHECK-NEXT: # Address                Size        File   Name
+# CHECK-DAG:  0x[[#%X,MAIN]]           0x00000001  [  1]  _main
+# CHECK-DAG:  0x[[#%X,BAR]]            0x00000001  [  1]  _bar
+# CHECK-DAG:  0x[[#%X,FOO]]            0x00000001  [  2]  __ZTIN3foo3bar4MethE
+# CHECK-DAG:  0x[[#%X,HIWORLD]]        0x0000000E  [  3]  literal string: Hello world!\n
+# CHECK-DAG:  0x[[#%X,HIITSME]]        0x0000000F  [  3]  literal string: Hello, it's me
+# CHECK-DAG:  0x[[#%X,HIITSME + 0xf]]  0x0000000E  [  3]  literal string: Hello world!\n
+# CHECK-DAG:  0x[[#%X,NUMBER]]         0x00000001  [  1]  _number
 
 # MAPFILE: "name":"Total Write map file"
 
-# RUN: %lld -dead_strip -map %t/stripped-map %t/test.o %t/foo.o %t/c-string-literal.o -o %t/stripped
+# RUN: %lld -demangle -dead_strip -map %t/stripped-map %t/test.o %t/foo.o %t/c-string-literal.o -o %t/stripped
 # RUN: FileCheck --check-prefix=STRIPPED %s < %t/stripped-map
 
 ## C-string literals should be printed as "literal string: <C string literal>"
 # STRIPPED-LABEL: Dead Stripped Symbols:
-# STRIPPED-DAG:   <<dead>> 0x00000001 [  2] _foo
-# STRIPPED-DAG:   <<dead>> 0x0000000E [  3] literal string: Hello world!\n
-# STRIPPED-DAG:   <<dead>> 0x0000000F [  3] literal string: Hello, it's me
-# STRIPPED-DAG:   <<dead>> 0x00000001 [  1] _number
+# STRIPPED-DAG:   <<dead>>	0x00000001	[  1] _bar
+# STRIPPED-DAG:   <<dead>>	0x00000001	[  1] _number
+# STRIPPED-DAG:   <<dead>>	0x00000001	[  2] __ZTIN3foo3bar4MethE
+# STRIPPED-DAG:   <<dead>>	0x0000000E	[  3] literal string: Hello world!\n
+# STRIPPED-DAG:   <<dead>>	0x0000000F	[  3] literal string: Hello, it's me
+# STRIPPED-DAG:   <<dead>>	0x0000000E	[  3] literal string: Hello world!\n
 
 # RUN: %lld --icf=all -map %t/icf-map %t/test.o %t/foo.o %t/c-string-literal.o -o %t/icf
 # RUN: FileCheck --check-prefix=ICF %s < %t/icf-map
 
+## Verify that folded symbols and cstrings have size zero. Note that ld64 prints
+## folded symbols but not folded cstrings; we print both.
+
 # ICF:     Symbols:
-# ICF-DAG: 0x[[#%X,FOO:]]  0x00000000  [  2] _foo
-# ICF-DAG: 0x[[#FOO]]      0x00000001  [  1] _bar
+# ICF-DAG: 0x[[#%X,FOO:]]     0x00000000  [  2] __ZTIN3foo3bar4MethE
+# ICF-DAG: 0x[[#FOO]]         0x00000001  [  1] _bar
+# ICF-DAG: 0x[[#%X,HIWORLD:]] 0x0000000E  [  3]  literal string: Hello world!\n
+# ICF-DAG: 0x[[#%X,HIWORLD]]  0x00000000  [  3]  literal string: Hello world!\n
 
 #--- foo.s
-## ICF will only fold sections marked as pure_instructions
-.section __TEXT,obj,regular,pure_instructions
-.globl _foo
-.alt_entry _alt_foo
-_foo:
+.globl __ZTIN3foo3bar4MethE
+## This C++ symbol makes it clear that we do not print the demangled name in
+## the map file, even if `-demangle` is passed.
+__ZTIN3foo3bar4MethE:
   nop
 
 .subsections_via_symbols
@@ -79,12 +87,10 @@ _foo:
 #--- test.s
 .comm _number, 1
 .globl _main, _bar
-.alt_entry _alt_bar
 
 _main:
   ret
 
-.section __TEXT,obj,regular,pure_instructions
 _bar:
   nop
 
@@ -101,4 +107,6 @@ _hello_world:
 _hello_its_me:
 .asciz "Hello, it's me"
 
+.asciz "Hello world!\n"
+
 .subsections_via_symbols


        


More information about the llvm-commits mailing list