[lld] b440c25 - [lld-macho][nfc] Give non-text ConcatOutputSections order-independent finalization

Thu Apr 7 15:13:35 PDT 2022

Author: Jez Ng
Date: 2022-04-07T18:13:27-04:00
New Revision: b440c25742749ce9e9008f84e020002290dca93c

URL: https://github.com/llvm/llvm-project/commit/b440c25742749ce9e9008f84e020002290dca93c
DIFF: https://github.com/llvm/llvm-project/commit/b440c25742749ce9e9008f84e020002290dca93c.diff

LOG: [lld-macho][nfc] Give non-text ConcatOutputSections order-independent finalization

This diff is motivated by my work to add proper DWARF unwind support. As
detailed in PR50956 functions that need DWARF unwind need to have
compact unwind entries synthesized for them. These CU entries encode an
offset within `__eh_frame` that points to the corresponding DWARF FDE.

In order to encode this offset during
`UnwindInfoSectionImpl::finalize()`, we need to first assign values to
`InputSection::outSecOff` for each `__eh_frame` subsection. But
`__eh_frame` is ordered after `__unwind_info` (according to ld64 at
least), which puts us in a bit of a bind: `outSecOff` gets assigned
during finalization, but `__eh_frame` is being finalized after
`__unwind_info`.

But it occurred to me that there's no real need for most
ConcatOutputSections to be finalized sequentially. It's only necessary
for text-containing ConcatOutputSections that may contain branch relocs
which may need thunks. ConcatOutputSections containing other types of
data can be finalized in any order.

This diff moves the finalization logic for non-text sections into a
separate `finalizeContents()` method. This method is called before
section address assignment & unwind info finalization takes place. In
theory we could call these `finalizeContents()` methods in parallel, but
in practice it seems to be faster to do it all on the main thread.

Reviewed By: #lld-macho, oontvoo

Differential Revision: https://reviews.llvm.org/D123279

Added: 
    

Modified: 
    lld/MachO/ConcatOutputSection.cpp
    lld/MachO/ConcatOutputSection.h
    lld/MachO/Writer.cpp

Removed: 
    


################################################################################
diff  --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp
index 1aa21d6c7bb1b..28f31ee7e07ca 100644

--- a/lld/MachO/ConcatOutputSection.cpp
+++ b/lld/MachO/ConcatOutputSection.cpp
@@ -121,7 +121,7 @@ DenseMap<Symbol *, ThunkInfo> lld::macho::thunkMap;
 // instructions, whereas CISC (i.e., x86) generally doesn't. RISC only needs
 // thunks for programs so large that branch source & destination addresses
 // might 
diff er more than the range of branch instruction(s).
-bool ConcatOutputSection::needsThunks() const {
+bool TextOutputSection::needsThunks() const {
   if (!target->usesThunks())
     return false;
   uint64_t isecAddr = addr;
@@ -138,7 +138,7 @@ bool ConcatOutputSection::needsThunks() const {
       auto *sym = r.referent.get<Symbol *>();
       // Pre-populate the thunkMap and memoize call site counts for every
       // InputSection and ThunkInfo. We do this for the benefit of
-      // ConcatOutputSection::estimateStubsInRangeVA()
+      // estimateStubsInRangeVA().
       ThunkInfo &thunkInfo = thunkMap[sym];
       // Knowing ThunkInfo call site count will help us know whether or not we
       // might need to create more for this referent at the time we are
@@ -154,7 +154,7 @@ bool ConcatOutputSection::needsThunks() const {
 // Since __stubs is placed after __text, we must estimate the address
 // beyond which stubs are within range of a simple forward branch.
 // This is called exactly once, when the last input section has been finalized.
-uint64_t ConcatOutputSection::estimateStubsInRangeVA(size_t callIdx) const {
+uint64_t TextOutputSection::estimateStubsInRangeVA(size_t callIdx) const {
   // Tally the functions which still have call sites remaining to process,
   // which yields the maximum number of thunks we might yet place.
   size_t maxPotentialThunks = 0;
@@ -193,23 +193,24 @@ uint64_t ConcatOutputSection::estimateStubsInRangeVA(size_t callIdx) const {
   return stubsInRangeVA;
 }
 
-void ConcatOutputSection::finalize() {
-  uint64_t isecAddr = addr;
-  uint64_t isecFileOff = fileOff;
-  auto finalizeOne = [&](ConcatInputSection *isec) {
-    isecAddr = alignTo(isecAddr, isec->align);
-    isecFileOff = alignTo(isecFileOff, isec->align);
-    isec->outSecOff = isecAddr - addr;
-    isec->isFinal = true;
-    isecAddr += isec->getSize();
-    isecFileOff += isec->getFileSize();
-  };
+void ConcatOutputSection::finalizeOne(ConcatInputSection *isec) {
+  size = alignTo(size, isec->align);
+  fileSize = alignTo(fileSize, isec->align);
+  isec->outSecOff = size;
+  isec->isFinal = true;
+  size += isec->getSize();
+  fileSize += isec->getFileSize();
+}
+
+void ConcatOutputSection::finalizeContents() {
+  for (ConcatInputSection *isec : inputs)
+    finalizeOne(isec);
+}
 
+void TextOutputSection::finalize() {
   if (!needsThunks()) {
     for (ConcatInputSection *isec : inputs)
       finalizeOne(isec);
-    size = isecAddr - addr;
-    fileSize = isecFileOff - fileOff;
     return;
   }
 
@@ -225,7 +226,7 @@ void ConcatOutputSection::finalize() {
   // Walk all sections in order. Finalize all sections that are less than
   // forwardBranchRange in front of it.
   // isecVA is the address of the current section.
-  // isecAddr is the start address of the first non-finalized section.
+  // addr + size is the start address of the first non-finalized section.
 
   // inputs[finalIdx] is for finalization (address-assignment)
   size_t finalIdx = 0;
@@ -246,7 +247,7 @@ void ConcatOutputSection::finalize() {
     // from the current position to the position where the thunks are inserted
     // grows. So leave room for a bunch of thunks.
     unsigned slop = 256 * thunkSize;
-    while (finalIdx < endIdx && isecAddr + inputs[finalIdx]->getSize() <
+    while (finalIdx < endIdx && addr + size + inputs[finalIdx]->getSize() <
                                     isecVA + forwardBranchRange - slop)
       finalizeOne(inputs[finalIdx++]);
 
@@ -307,7 +308,7 @@ void ConcatOutputSection::finalize() {
         }
       }
       // ... otherwise, create a new thunk.
-      if (isecAddr > highVA) {
+      if (addr + size > highVA) {
         // There were too many consecutive branch instructions for `slop`
         // above. If you hit this: For the current algorithm, just bumping up
         // slop above and trying again is probably simplest. (See also PR51578
@@ -342,12 +343,11 @@ void ConcatOutputSection::finalize() {
       thunkInfo.sym->used = true;
       target->populateThunk(thunkInfo.isec, funcSym);
       finalizeOne(thunkInfo.isec);
+      fprintf(stderr, "%llx\n", thunkInfo.isec->outSecOff);
       thunks.push_back(thunkInfo.isec);
       ++thunkCount;
     }
   }
-  size = isecAddr - addr;
-  fileSize = isecFileOff - fileOff;
 
   log("thunks for " + parent->name + "," + name +
       ": funcs = " + std::to_string(thunkMap.size()) +
@@ -358,6 +358,11 @@ void ConcatOutputSection::finalize() {
 }
 
 void ConcatOutputSection::writeTo(uint8_t *buf) const {
+  for (ConcatInputSection *isec : inputs)
+    isec->writeTo(buf + isec->outSecOff);
+}
+
+void TextOutputSection::writeTo(uint8_t *buf) const {
   // Merge input sections from thunk & ordinary vectors
   size_t i = 0, ie = inputs.size();
   size_t t = 0, te = thunks.size();
@@ -402,8 +407,14 @@ ConcatOutputSection *
 ConcatOutputSection::getOrCreateForInput(const InputSection *isec) {
   NamePair names = maybeRenameSection({isec->getSegName(), isec->getName()});
   ConcatOutputSection *&osec = concatOutputSections[names];
-  if (!osec)
-    osec = make<ConcatOutputSection>(names.second);
+  if (!osec) {
+    if (isec->getSegName() == segment_names::text &&
+        isec->getName() != section_names::gccExceptTab &&
+        isec->getName() != section_names::ehFrame)
+      osec = make<TextOutputSection>(names.second);
+    else
+      osec = make<ConcatOutputSection>(names.second);
+  }
   return osec;
 }
 

diff  --git a/lld/MachO/ConcatOutputSection.h b/lld/MachO/ConcatOutputSection.h
index ec3d6bfbc344c..c7404f48e27a3 100644
--- a/lld/MachO/ConcatOutputSection.h
+++ b/lld/MachO/ConcatOutputSection.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLD_MACHO_MERGED_OUTPUT_SECTION_H
-#define LLD_MACHO_MERGED_OUTPUT_SECTION_H
+#ifndef LLD_MACHO_CONCAT_OUTPUT_SECTION_H
+#define LLD_MACHO_CONCAT_OUTPUT_SECTION_H
 
 #include "InputSection.h"
 #include "OutputSection.h"
@@ -24,7 +24,7 @@ class Defined;
 // files that are labeled with the same segment and section name. This class
 // contains all such sections and writes the data from each section sequentially
 // in the final binary.
-class ConcatOutputSection final : public OutputSection {
+class ConcatOutputSection : public OutputSection {
 public:
   explicit ConcatOutputSection(StringRef name)
       : OutputSection(ConcatKind, name) {}
@@ -37,27 +37,46 @@ class ConcatOutputSection final : public OutputSection {
   uint64_t getSize() const override { return size; }
   uint64_t getFileSize() const override { return fileSize; }
 
-  void addInput(ConcatInputSection *input);
-  void finalize() override;
-  bool needsThunks() const;
-  uint64_t estimateStubsInRangeVA(size_t callIdx) const;
+  // Assign values to InputSection::outSecOff. In contrast to TextOutputSection,
+  // which does this in its implementation of `finalize()`, we can do this
+  // without `finalize()`'s sequential guarantees detailed in the block comment
+  // of `OutputSection::finalize()`.
+  virtual void finalizeContents();
 
+  void addInput(ConcatInputSection *input);
   void writeTo(uint8_t *buf) const override;
 
-  std::vector<ConcatInputSection *> inputs;
-  std::vector<ConcatInputSection *> thunks;
-
   static bool classof(const OutputSection *sec) {
     return sec->kind() == ConcatKind;
   }
 
   static ConcatOutputSection *getOrCreateForInput(const InputSection *);
 
-private:
-  void finalizeFlags(InputSection *input);
+  std::vector<ConcatInputSection *> inputs;
 
+protected:
   size_t size = 0;
   uint64_t fileSize = 0;
+  void finalizeOne(ConcatInputSection *);
+
+private:
+  void finalizeFlags(InputSection *input);
+};
+
+// ConcatOutputSections that contain code (text) require special handling to
+// support thunk insertion.
+class TextOutputSection : public ConcatOutputSection {
+public:
+  explicit TextOutputSection(StringRef name) : ConcatOutputSection(name) {}
+  void finalizeContents() override {}
+  void finalize() override;
+  bool needsThunks() const;
+  void writeTo(uint8_t *buf) const override;
+
+private:
+  uint64_t estimateStubsInRangeVA(size_t callIdx) const;
+
+  std::vector<ConcatInputSection *> thunks;
 };
 
 // We maintain one ThunkInfo per real function.

diff  --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
index fdb4cafd5cb95..727640e1f582e 100644
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -972,6 +972,21 @@ template <class LP> void Writer::createOutputSections() {
 void Writer::finalizeAddresses() {
   TimeTraceScope timeScope("Finalize addresses");
   uint64_t pageSize = target->getPageSize();
+
+  // We could parallelize this loop, but local benchmarking indicates it is
+  // faster to do it all in the main thread.
+  for (OutputSegment *seg : outputSegments) {
+    if (seg == linkEditSegment)
+      continue;
+    for (OutputSection *osec : seg->getSections()) {
+      if (!osec->isNeeded())
+        continue;
+      // Other kinds of OutputSections have already been finalized.
+      if (auto concatOsec = dyn_cast<ConcatOutputSection>(osec))
+          concatOsec->finalizeContents();
+    }
+  }
+
   // Ensure that segments (and the sections they contain) are allocated
   // addresses in ascending order, which dyld requires.
   //