[lld] a8843ec - [lld-macho] Parallelize linker optimization hint processing

Fri Sep 16 08:39:22 PDT 2022

Author: Daniel Bertalan
Date: 2022-09-16T17:38:46+02:00
New Revision: a8843ec95295a927d3f31719cec4b6bcefb90844

URL: https://github.com/llvm/llvm-project/commit/a8843ec95295a927d3f31719cec4b6bcefb90844
DIFF: https://github.com/llvm/llvm-project/commit/a8843ec95295a927d3f31719cec4b6bcefb90844.diff

LOG: [lld-macho] Parallelize linker optimization hint processing

This commit moves the parsing of linker optimization hints into
`ARM64::applyOptimizationHints`. This lets us avoid allocating memory
for holding the parsed information, and moves work out of
`ObjFile::parse`, which is not parallelized at the moment.

This change reduces the overhead of processing LOHs to 25-30 ms when
linking Chromium Framework on my M1 machine; previously it took close to
100 ms.

There's no statistically significant change in runtime for a --threads=1
link.

Performance figures with all 8 cores utilized:

      N           Min           Max        Median           Avg        Stddev
  x  20     3.8027232     3.8760762     3.8505335     3.8454145   0.026352574
  +  20     3.7019017     3.8660538     3.7546209     3.7620371   0.032680043
  Difference at 95.0% confidence
  	-0.0833775 +/- 0.019
  	-2.16823% +/- 0.494094%
  	(Student's t, pooled s = 0.0296854)

Differential Revision: https://reviews.llvm.org/D133439

Added: 
    

Modified: 
    lld/MachO/Arch/ARM64.cpp
    lld/MachO/InputFiles.cpp
    lld/MachO/InputFiles.h
    lld/MachO/InputSection.cpp
    lld/MachO/InputSection.h
    lld/MachO/Relocations.h
    lld/MachO/Target.h
    lld/MachO/Writer.cpp
    lld/test/MachO/invalid/invalid-loh.s
    lld/test/MachO/loh-adrp-adrp.s

Removed: 
    


################################################################################
diff  --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp
index 2041de6561c32..241b3f557b45d 100644

--- a/lld/MachO/Arch/ARM64.cpp
+++ b/lld/MachO/Arch/ARM64.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
@@ -40,8 +41,7 @@ struct ARM64 : ARM64Common {
                             uint64_t selectorIndex, uint64_t gotAddr,
                             uint64_t msgSendIndex) const override;
   void populateThunk(InputSection *thunk, Symbol *funcSym) override;
-  void applyOptimizationHints(uint8_t *,
-                              const ConcatInputSection *) const override;
+  void applyOptimizationHints(uint8_t *, const ObjFile &) const override;
 };
 
 } // namespace
@@ -196,23 +196,6 @@ struct Ldr {
   ExtendType extendType;
   int64_t offset;
 };
-
-class OptimizationHintContext {
-public:
-  OptimizationHintContext(uint8_t *buf, const ConcatInputSection *isec)
-      : buf(buf), isec(isec) {}
-
-  void applyAdrpAdd(uint64_t, uint64_t);
-  void applyAdrpAdrp(uint64_t, uint64_t);
-  void applyAdrpLdr(uint64_t, uint64_t);
-  void applyAdrpLdrGot(uint64_t, uint64_t);
-  void applyAdrpAddLdr(uint64_t, uint64_t, uint64_t);
-  void applyAdrpLdrGotLdr(uint64_t, uint64_t, uint64_t);
-
-private:
-  uint8_t *buf;
-  const ConcatInputSection *isec;
-};
 } // namespace
 
 static bool parseAdrp(uint32_t insn, Adrp &adrp) {
@@ -347,7 +330,8 @@ static void writeImmediateLdr(void *loc, const Ldr &ldr) {
 // ->
 //   adr  xM, _foo
 //   nop
-void OptimizationHintContext::applyAdrpAdd(uint64_t offset1, uint64_t offset2) {
+static void applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec,
+                         uint64_t offset1, uint64_t offset2) {
   uint32_t ins1 = read32le(buf + offset1);
   uint32_t ins2 = read32le(buf + offset2);
   Adrp adrp;
@@ -375,8 +359,8 @@ void OptimizationHintContext::applyAdrpAdd(uint64_t offset1, uint64_t offset2) {
 // ->
 //   adrp xN, _foo@PAGE
 //   nop
-void OptimizationHintContext::applyAdrpAdrp(uint64_t offset1,
-                                            uint64_t offset2) {
+static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec,
+                          uint64_t offset1, uint64_t offset2) {
   uint32_t ins1 = read32le(buf + offset1);
   uint32_t ins2 = read32le(buf + offset2);
   Adrp adrp1, adrp2;
@@ -402,7 +386,8 @@ void OptimizationHintContext::applyAdrpAdrp(uint64_t offset1,
 // ->
 //   nop
 //   ldr  xM, _foo
-void OptimizationHintContext::applyAdrpLdr(uint64_t offset1, uint64_t offset2) {
+static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec,
+                         uint64_t offset1, uint64_t offset2) {
   uint32_t ins1 = read32le(buf + offset1);
   uint32_t ins2 = read32le(buf + offset2);
   Adrp adrp;
@@ -426,15 +411,15 @@ void OptimizationHintContext::applyAdrpLdr(uint64_t offset1, uint64_t offset2) {
 // GOT loads are emitted by the compiler as a pair of adrp and ldr instructions,
 // but they may be changed to adrp+add by relaxGotLoad(). This hint performs
 // the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed.
-void OptimizationHintContext::applyAdrpLdrGot(uint64_t offset1,
-                                              uint64_t offset2) {
+static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec,
+                            uint64_t offset1, uint64_t offset2) {
   uint32_t ins2 = read32le(buf + offset2);
   Add add;
   Ldr ldr;
   if (parseAdd(ins2, add))
-    applyAdrpAdd(offset1, offset2);
+    applyAdrpAdd(buf, isec, offset1, offset2);
   else if (parseLdr(ins2, ldr))
-    applyAdrpLdr(offset1, offset2);
+    applyAdrpLdr(buf, isec, offset1, offset2);
 }
 
 // Optimizes an adrp+add+ldr sequence used for loading from a local symbol's
@@ -444,9 +429,9 @@ void OptimizationHintContext::applyAdrpLdrGot(uint64_t offset1,
 //   adrp x0, _foo@PAGE
 //   add  x1, x0, _foo@PAGEOFF
 //   ldr  x2, [x1, #off]
-void OptimizationHintContext::applyAdrpAddLdr(uint64_t offset1,
-                                              uint64_t offset2,
-                                              uint64_t offset3) {
+static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec,
+                            uint64_t offset1, uint64_t offset2,
+                            uint64_t offset3) {
   uint32_t ins1 = read32le(buf + offset1);
   Adrp adrp;
   if (!parseAdrp(ins1, adrp))
@@ -512,15 +497,15 @@ void OptimizationHintContext::applyAdrpAddLdr(uint64_t offset1,
 // the GOT entry can be loaded with a single literal ldr instruction.
 // If the referenced symbol is local and thus has been relaxed to adrp+add+ldr,
 // we perform the AdrpAddLdr transformation.
-void OptimizationHintContext::applyAdrpLdrGotLdr(uint64_t offset1,
-                                                 uint64_t offset2,
-                                                 uint64_t offset3) {
+static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec,
+                               uint64_t offset1, uint64_t offset2,
+                               uint64_t offset3) {
   uint32_t ins2 = read32le(buf + offset2);
   Add add;
   Ldr ldr2;
 
   if (parseAdd(ins2, add)) {
-    applyAdrpAddLdr(offset1, offset2, offset3);
+    applyAdrpAddLdr(buf, isec, offset1, offset2, offset3);
   } else if (parseLdr(ins2, ldr2)) {
     // adrp x1, _foo@GOTPAGE
     // ldr  x2, [x1, _foo@GOTPAGEOFF]
@@ -559,47 +544,167 @@ void OptimizationHintContext::applyAdrpLdrGotLdr(uint64_t offset1,
   }
 }
 
-void ARM64::applyOptimizationHints(uint8_t *buf,
-                                   const ConcatInputSection *isec) const {
-  assert(isec);
+static uint64_t readValue(const uint8_t *&ptr, const uint8_t *end) {
+  unsigned int n = 0;
+  uint64_t value = decodeULEB128(ptr, &n, end);
+  ptr += n;
+  return value;
+}
 
-  // Note: Some of these optimizations might not be valid when shared regions
-  // are in use. Will need to revisit this if splitSegInfo is added.
+template <typename Callback>
+static void forEachHint(ArrayRef<uint8_t> data, Callback callback) {
+  std::array<uint64_t, 3> args;
 
-  OptimizationHintContext ctx(buf, isec);
-  for (const OptimizationHint &hint : isec->optimizationHints) {
-    switch (hint.type) {
-    case LOH_ARM64_ADRP_ADRP:
-      // This is done in another pass because the other optimization hints
-      // might cause its targets to be turned into NOPs.
+  for (const uint8_t *p = data.begin(), *end = data.end(); p < end;) {
+    uint64_t type = readValue(p, end);
+    if (type == 0)
+      break;
+
+    uint64_t argCount = readValue(p, end);
+    // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others.
+    if (argCount > 3) {
+      for (unsigned i = 0; i < argCount; ++i)
+        readValue(p, end);
+      continue;
+    }
+
+    for (unsigned i = 0; i < argCount; ++i)
+      args[i] = readValue(p, end);
+    callback(type, ArrayRef<uint64_t>(args.data(), argCount));
+  }
+}
+
+// On RISC architectures like arm64, materializing a memory address generally
+// takes multiple instructions. If the referenced symbol is located close enough
+// in memory, fewer instructions are needed.
+//
+// Linker optimization hints record where addresses are computed. After
+// addresses have been assigned, if possible, we change them to a shorter
+// sequence of instructions. The size of the binary is not modified; the
+// eliminated instructions are replaced with NOPs. This still leads to faster
+// code as the CPU can skip over NOPs quickly.
+//
+// LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which
+// points to a sequence of ULEB128-encoded numbers. Each entry specifies a
+// transformation kind, and 2 or 3 addresses where the instructions are located.
+void ARM64::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) const {
+  ArrayRef<uint8_t> data = obj.getOptimizationHints();
+  if (data.empty())
+    return;
+
+  const ConcatInputSection *section = nullptr;
+  uint64_t sectionAddr = 0;
+  uint8_t *buf = nullptr;
+
+  auto findSection = [&](uint64_t addr) {
+    if (section && addr >= sectionAddr &&
+        addr < sectionAddr + section->getSize())
+      return true;
+
+    auto secIt = std::prev(llvm::upper_bound(
+        obj.sections, addr,
+        [](uint64_t off, const Section *sec) { return off < sec->addr; }));
+    const Section *sec = *secIt;
+
+    auto subsecIt = std::prev(llvm::upper_bound(
+        sec->subsections, addr - sec->addr,
+        [](uint64_t off, Subsection subsec) { return off < subsec.offset; }));
+    const Subsection &subsec = *subsecIt;
+    const ConcatInputSection *isec =
+        dyn_cast_or_null<ConcatInputSection>(subsec.isec);
+    if (!isec || isec->shouldOmitFromOutput())
+      return false;
+
+    section = isec;
+    sectionAddr = subsec.offset + sec->addr;
+    buf = outBuf + section->outSecOff + section->parent->fileOff;
+    return true;
+  };
+
+  auto isValidOffset = [&](uint64_t offset) {
+    if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) {
+      error("linker optimization hint spans multiple sections");
+      return false;
+    }
+    return true;
+  };
+
+  bool hasAdrpAdrp = false;
+  forEachHint(data, [&](uint64_t kind, ArrayRef<uint64_t> args) {
+    if (kind == LOH_ARM64_ADRP_ADRP) {
+      hasAdrpAdrp = true;
+      return;
+    }
+
+    if (!findSection(args[0]))
+      return;
+    switch (kind) {
+    case LOH_ARM64_ADRP_ADD:
+      if (isValidOffset(args[1]))
+        applyAdrpAdd(buf, section, args[0] - sectionAddr,
+                     args[1] - sectionAddr);
       break;
     case LOH_ARM64_ADRP_LDR:
-      ctx.applyAdrpLdr(hint.offset0, hint.offset0 + hint.delta[0]);
+      if (isValidOffset(args[1]))
+        applyAdrpLdr(buf, section, args[0] - sectionAddr,
+                     args[1] - sectionAddr);
+      break;
+    case LOH_ARM64_ADRP_LDR_GOT:
+      if (isValidOffset(args[1]))
+        applyAdrpLdrGot(buf, section, args[0] - sectionAddr,
+                        args[1] - sectionAddr);
       break;
     case LOH_ARM64_ADRP_ADD_LDR:
-      ctx.applyAdrpAddLdr(hint.offset0, hint.offset0 + hint.delta[0],
-                          hint.offset0 + hint.delta[1]);
+      if (isValidOffset(args[1]) && isValidOffset(args[2]))
+        applyAdrpAddLdr(buf, section, args[0] - sectionAddr,
+                        args[1] - sectionAddr, args[2] - sectionAddr);
       break;
     case LOH_ARM64_ADRP_LDR_GOT_LDR:
-      ctx.applyAdrpLdrGotLdr(hint.offset0, hint.offset0 + hint.delta[0],
-                             hint.offset0 + hint.delta[1]);
+      if (isValidOffset(args[1]) && isValidOffset(args[2]))
+        applyAdrpLdrGotLdr(buf, section, args[0] - sectionAddr,
+                           args[1] - sectionAddr, args[2] - sectionAddr);
       break;
     case LOH_ARM64_ADRP_ADD_STR:
     case LOH_ARM64_ADRP_LDR_GOT_STR:
       // TODO: Implement these
       break;
-    case LOH_ARM64_ADRP_ADD:
-      ctx.applyAdrpAdd(hint.offset0, hint.offset0 + hint.delta[0]);
-      break;
-    case LOH_ARM64_ADRP_LDR_GOT:
-      ctx.applyAdrpLdrGot(hint.offset0, hint.offset0 + hint.delta[0]);
-      break;
     }
-  }
+  });
+
+  if (!hasAdrpAdrp)
+    return;
 
-  for (const OptimizationHint &hint : isec->optimizationHints)
-    if (hint.type == LOH_ARM64_ADRP_ADRP)
-      ctx.applyAdrpAdrp(hint.offset0, hint.offset0 + hint.delta[0]);
+  // AdrpAdrp optimization hints are performed in a second pass because they
+  // might interfere with other transformations. For instance, consider the
+  // following input:
+  //
+  //   adrp x0, _foo@PAGE
+  //   add  x1, x0, _foo@PAGEOFF
+  //   adrp x0, _bar@PAGE
+  //   add  x2, x0, _bar@PAGEOFF
+  //
+  // If we perform the AdrpAdrp relaxation first, we get:
+  //
+  //   adrp x0, _foo@PAGE
+  //   add  x1, x0, _foo@PAGEOFF
+  //   nop
+  //   add x2, x0, _bar@PAGEOFF
+  //
+  // If we then apply AdrpAdd to the first two instructions, the add will have a
+  // garbage value in x0:
+  //
+  //   adr  x1, _foo
+  //   nop
+  //   nop
+  //   add  x2, x0, _bar@PAGEOFF
+  forEachHint(data, [&](uint64_t kind, ArrayRef<uint64_t> args) {
+    if (kind != LOH_ARM64_ADRP_ADRP)
+      return;
+    if (!findSection(args[0]))
+      return;
+    if (isValidOffset(args[1]))
+      applyAdrpAdrp(buf, section, args[0] - sectionAddr, args[1] - sectionAddr);
+  });
 }
 
 TargetInfo *macho::createARM64TargetInfo() {

diff  --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp
index 588b87a2927f9..ee382a5aeabc3 100644
--- a/lld/MachO/InputFiles.cpp
+++ b/lld/MachO/InputFiles.cpp
@@ -463,155 +463,6 @@ static Defined *findSymbolAtOffset(const ConcatInputSection *isec,
   return *it;
 }
 
-// Linker optimization hints mark a sequence of instructions used for
-// synthesizing an address which that be transformed into a faster sequence. The
-// transformations depend on conditions that are determined at link time, like
-// the distance to the referenced symbol or its alignment.
-//
-// Each hint has a type and refers to 2 or 3 instructions. Each of those
-// instructions must have a corresponding relocation. After addresses have been
-// finalized and relocations have been performed, we check if the requirements
-// hold, and perform the optimizations if they do.
-//
-// Similar linker relaxations exist for ELF as well, with the difference being
-// that the explicit marking allows for the relaxation of non-consecutive
-// relocations too.
-//
-// The specific types of hints are documented in Arch/ARM64.cpp
-void ObjFile::parseOptimizationHints(ArrayRef<uint8_t> data) {
-  auto expectedArgCount = [](uint8_t type) {
-    switch (type) {
-    case LOH_ARM64_ADRP_ADRP:
-    case LOH_ARM64_ADRP_LDR:
-    case LOH_ARM64_ADRP_ADD:
-    case LOH_ARM64_ADRP_LDR_GOT:
-      return 2;
-    case LOH_ARM64_ADRP_ADD_LDR:
-    case LOH_ARM64_ADRP_ADD_STR:
-    case LOH_ARM64_ADRP_LDR_GOT_LDR:
-    case LOH_ARM64_ADRP_LDR_GOT_STR:
-      return 3;
-    }
-    return -1;
-  };
-
-  // Each hint contains at least 4 ULEB128-encoded fields, so in the worst case,
-  // there are data.size() / 4 LOHs. It's a huge overestimation though, as
-  // offsets are unlikely to fall in the 0-127 byte range, so we pre-allocate
-  // half as much.
-  optimizationHints.reserve(data.size() / 8);
-
-  for (const uint8_t *p = data.begin(); p < data.end();) {
-    const ptrdiff_t inputOffset = p - data.begin();
-    unsigned int n = 0;
-    uint8_t type = decodeULEB128(p, &n, data.end());
-    p += n;
-
-    // An entry of type 0 terminates the list.
-    if (type == 0)
-      break;
-
-    int expectedCount = expectedArgCount(type);
-    if (LLVM_UNLIKELY(expectedCount == -1)) {
-      error("Linker optimization hint at offset " + Twine(inputOffset) +
-            " has unknown type " + Twine(type));
-      return;
-    }
-
-    uint8_t argCount = decodeULEB128(p, &n, data.end());
-    p += n;
-
-    if (LLVM_UNLIKELY(argCount != expectedCount)) {
-      error("Linker optimization hint at offset " + Twine(inputOffset) +
-            " has " + Twine(argCount) + " arguments instead of the expected " +
-            Twine(expectedCount));
-      return;
-    }
-
-    uint64_t offset0 = decodeULEB128(p, &n, data.end());
-    p += n;
-
-    int16_t delta[2];
-    for (int i = 0; i < argCount - 1; ++i) {
-      uint64_t address = decodeULEB128(p, &n, data.end());
-      p += n;
-      int64_t d = address - offset0;
-      if (LLVM_UNLIKELY(d > std::numeric_limits<int16_t>::max() ||
-                        d < std::numeric_limits<int16_t>::min())) {
-        error("Linker optimization hint at offset " + Twine(inputOffset) +
-              " has addresses too far apart");
-        return;
-      }
-      delta[i] = d;
-    }
-
-    optimizationHints.push_back({offset0, {delta[0], delta[1]}, type});
-  }
-
-  // We sort the per-object vector of optimization hints so each section only
-  // needs to hold an ArrayRef to a contiguous range of hints.
-  llvm::sort(optimizationHints,
-             [](const OptimizationHint &a, const OptimizationHint &b) {
-               return a.offset0 < b.offset0;
-             });
-
-  auto section = sections.begin();
-  auto subsection = (*section)->subsections.begin();
-  uint64_t subsectionBase = 0;
-  uint64_t subsectionEnd = 0;
-
-  auto updateAddr = [&]() {
-    subsectionBase = (*section)->addr + subsection->offset;
-    subsectionEnd = subsectionBase + subsection->isec->getSize();
-  };
-
-  auto advanceSubsection = [&]() {
-    if (section == sections.end())
-      return;
-    ++subsection;
-    while (subsection == (*section)->subsections.end()) {
-      ++section;
-      if (section == sections.end())
-        return;
-      subsection = (*section)->subsections.begin();
-    }
-  };
-
-  updateAddr();
-  auto hintStart = optimizationHints.begin();
-  for (auto hintEnd = hintStart, end = optimizationHints.end(); hintEnd != end;
-       ++hintEnd) {
-    if (hintEnd->offset0 >= subsectionEnd) {
-      subsection->isec->optimizationHints =
-          ArrayRef<OptimizationHint>(&*hintStart, hintEnd - hintStart);
-
-      hintStart = hintEnd;
-      while (hintStart->offset0 >= subsectionEnd) {
-        advanceSubsection();
-        if (section == sections.end())
-          break;
-        updateAddr();
-        assert(hintStart->offset0 >= subsectionBase);
-      }
-    }
-
-    hintEnd->offset0 -= subsectionBase;
-    for (int i = 0, count = expectedArgCount(hintEnd->type); i < count - 1;
-         ++i) {
-      if (LLVM_UNLIKELY(
-              hintEnd->delta[i] < -static_cast<int64_t>(hintEnd->offset0) ||
-              hintEnd->delta[i] >=
-                  static_cast<int64_t>(subsectionEnd - hintEnd->offset0))) {
-        error("Linker optimization hint spans multiple sections");
-        return;
-      }
-    }
-  }
-  if (section != sections.end())
-    subsection->isec->optimizationHints = ArrayRef<OptimizationHint>(
-        &*hintStart, optimizationHints.end() - hintStart);
-}
-
 template <class SectionHeader>
 static bool validateRelocationInfo(InputFile *file, const SectionHeader &sec,
                                    relocation_info rel) {
@@ -1129,11 +980,6 @@ template <class LP> void ObjFile::parse() {
     if (!sections[i]->subsections.empty())
       parseRelocations(sectionHeaders, sectionHeaders[i], *sections[i]);
 
-  if (!config->ignoreOptimizationHints)
-    if (auto *cmd = findCommand<linkedit_data_command>(
-            hdr, LC_LINKER_OPTIMIZATION_HINT))
-      parseOptimizationHints({buf + cmd->dataoff, cmd->datasize});
-
   parseDebugInfo();
 
   Section *ehFrameSection = nullptr;
@@ -1213,6 +1059,14 @@ ArrayRef<data_in_code_entry> ObjFile::getDataInCode() const {
           c->datasize / sizeof(data_in_code_entry)};
 }
 
+ArrayRef<uint8_t> ObjFile::getOptimizationHints() const {
+  const auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
+  if (auto *cmd =
+          findCommand<linkedit_data_command>(buf, LC_LINKER_OPTIMIZATION_HINT))
+    return {buf + cmd->dataoff, cmd->datasize};
+  return {};
+}
+
 // Create pointers from symbols to their associated compact unwind entries.
 void ObjFile::registerCompactUnwind(Section &compactUnwindSection) {
   for (const Subsection &subsection : compactUnwindSection.subsections) {

diff  --git a/lld/MachO/InputFiles.h b/lld/MachO/InputFiles.h
index 1b454f98932a8..b883bd040f889 100644
--- a/lld/MachO/InputFiles.h
+++ b/lld/MachO/InputFiles.h
@@ -159,6 +159,7 @@ class ObjFile final : public InputFile {
   ObjFile(MemoryBufferRef mb, uint32_t modTime, StringRef archiveName,
           bool lazy = false, bool forceHidden = false);
   ArrayRef<llvm::MachO::data_in_code_entry> getDataInCode() const;
+  ArrayRef<uint8_t> getOptimizationHints() const;
   template <class LP> void parse();
 
   static bool classof(const InputFile *f) { return f->kind() == ObjKind; }
@@ -176,7 +177,6 @@ class ObjFile final : public InputFile {
   std::vector<ConcatInputSection *> debugSections;
   std::vector<CallGraphEntry> callGraph;
   llvm::DenseMap<ConcatInputSection *, FDE> fdes;
-  std::vector<OptimizationHint> optimizationHints;
   std::vector<AliasSymbol *> aliases;
 
 private:
@@ -193,7 +193,6 @@ class ObjFile final : public InputFile {
   void parseRelocations(ArrayRef<SectionHeader> sectionHeaders,
                         const SectionHeader &, Section &);
   void parseDebugInfo();
-  void parseOptimizationHints(ArrayRef<uint8_t> data);
   void splitEhFrames(ArrayRef<uint8_t> dataArr, Section &ehFrameSection);
   void registerCompactUnwind(Section &compactUnwindSection);
   void registerEhFrames(Section &ehFrameSection);

diff  --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index f6a03649985e0..660a27c3d3179 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -29,8 +29,8 @@ using namespace lld::macho;
 // Verify ConcatInputSection's size on 64-bit builds. The size of std::vector
 // can differ based on STL debug levels (e.g. iterator debugging on MSVC's STL),
 // so account for that.
-static_assert(sizeof(void *) != 8 || sizeof(ConcatInputSection) ==
-                                         sizeof(std::vector<Reloc>) + 104,
+static_assert(sizeof(void *) != 8 ||
+                  sizeof(ConcatInputSection) == sizeof(std::vector<Reloc>) + 88,
               "Try to minimize ConcatInputSection's size, we create many "
               "instances of it");
 
@@ -219,8 +219,6 @@ void ConcatInputSection::writeTo(uint8_t *buf) {
     }
     target->relocateOne(loc, r, referentVA, getVA() + r.offset);
   }
-
-  target->applyOptimizationHints(buf, this);
 }
 
 ConcatInputSection *macho::makeSyntheticInputSection(StringRef segName,

diff  --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index 2b8369142b802..ecb46f926a0a5 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -83,7 +83,6 @@ class InputSection {
   OutputSection *parent = nullptr;
   ArrayRef<uint8_t> data;
   std::vector<Reloc> relocs;
-  ArrayRef<OptimizationHint> optimizationHints;
   // The symbols that belong to this InputSection, sorted by value. With
   // .subsections_via_symbols, there is typically only one element here.
   llvm::TinyPtrVector<Defined *> symbols;

diff  --git a/lld/MachO/Relocations.h b/lld/MachO/Relocations.h
index d0eba4643bd07..023d25a795a0d 100644
--- a/lld/MachO/Relocations.h
+++ b/lld/MachO/Relocations.h
@@ -69,14 +69,6 @@ struct Reloc {
         addend(addend), referent(referent) {}
 };
 
-struct OptimizationHint {
-  // Offset of the first address within the containing InputSection.
-  uint64_t offset0;
-  // Offset of the other addresses relative to the first one.
-  int16_t delta[2];
-  uint8_t type;
-};
-
 bool validateSymbolRelocation(const Symbol *, const InputSection *,
                               const Reloc &);
 

diff  --git a/lld/MachO/Target.h b/lld/MachO/Target.h
index ea8141740413e..ff7998b96ce8a 100644
--- a/lld/MachO/Target.h
+++ b/lld/MachO/Target.h
@@ -27,7 +27,7 @@ class Symbol;
 class Defined;
 class DylibSymbol;
 class InputSection;
-class ConcatInputSection;
+class ObjFile;
 
 class TargetInfo {
 public:
@@ -97,8 +97,7 @@ class TargetInfo {
     llvm_unreachable("Unsupported architecture for dtrace symbols");
   }
 
-  virtual void applyOptimizationHints(uint8_t *buf,
-                                      const ConcatInputSection *) const {};
+  virtual void applyOptimizationHints(uint8_t *, const ObjFile &) const {};
 
   uint32_t magic;
   llvm::MachO::CPUType cpuType;

diff  --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
index 44c34db00b1fd..a8ae554a6246f 100644
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -60,6 +60,7 @@ class Writer {
 
   void openFile();
   void writeSections();
+  void applyOptimizationHints();
   void writeUuid();
   void writeCodeSignature();
   void writeOutputFile();
@@ -1072,6 +1073,18 @@ void Writer::writeSections() {
   });
 }
 
+void Writer::applyOptimizationHints() {
+  if (config->arch() != AK_arm64 || config->ignoreOptimizationHints)
+    return;
+
+  uint8_t *buf = buffer->getBufferStart();
+  TimeTraceScope timeScope("Apply linker optimization hints");
+  parallelForEach(inputFiles, [buf](const InputFile *file) {
+    if (const auto *objFile = dyn_cast<ObjFile>(file))
+      target->applyOptimizationHints(buf, *objFile);
+  });
+}
+
 // In order to utilize multiple cores, we first split the buffer into chunks,
 // compute a hash for each chunk, and then compute a hash value of the hash
 // values.
@@ -1114,6 +1127,7 @@ void Writer::writeOutputFile() {
   if (errorCount())
     return;
   writeSections();
+  applyOptimizationHints();
   writeUuid();
   writeCodeSignature();
 

diff  --git a/lld/test/MachO/invalid/invalid-loh.s b/lld/test/MachO/invalid/invalid-loh.s
index 19ed52866948a..9bf6b012709b8 100644
--- a/lld/test/MachO/invalid/invalid-loh.s
+++ b/lld/test/MachO/invalid/invalid-loh.s
@@ -1,15 +1,10 @@
 # REQUIRES: aarch64
 
-# RUN: rm -rf %t; split-file %s %t
-# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/section.s -o %t/section.o
-# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/far.s -o %t/far.o
-# RUN: not %lld -arch arm64 %t/section.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=SECTION
-# RUN: not %lld -arch arm64 %t/far.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=FAR
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t.o
+# RUN: not %lld -arch arm64 %t.o -o /dev/null 2>&1 | FileCheck %s
 
-# SECTION: error: Linker optimization hint spans multiple sections
-# FAR:     error: Linker optimization hint at offset 0 has addresses too far apart
+# CHECK: error: linker optimization hint spans multiple sections
 
-#--- section.s
 .globl _main
 _main:
 L1:
@@ -23,17 +18,3 @@ _target:
 
 .loh AdrpAdd L1, L2
 .subsections_via_symbols
-
-#--- far.s
-.globl _main
-_main:
-L1:
-  adrp x0, _target@PAGE
-  .zero 0x8000
-L2:
-  add  x0, x0, _target@PAGEOFF
-
-_target:
-
-.loh AdrpAdd L1, L2
-.subsections_via_symbols

diff  --git a/lld/test/MachO/loh-adrp-adrp.s b/lld/test/MachO/loh-adrp-adrp.s
index 05abc8ab1961d..55d6a614f374e 100644
--- a/lld/test/MachO/loh-adrp-adrp.s
+++ b/lld/test/MachO/loh-adrp-adrp.s
@@ -17,6 +17,11 @@
 ## Not an adrp instruction (invalid)
 # CHECK-NEXT: nop
 # CHECK-NEXT: adrp x4
+## Other relaxations take precedence over AdrpAdrp
+# CHECK-NEXT: adr x6
+# CHECK-NEXT: nop
+# CHECK-NEXT: adr x6
+# CHECK-NEXT: nop
 
 .text
 .align 2
@@ -39,6 +44,14 @@ L7:
   nop
 L8:
   adrp x4, _baz@PAGE
+L9:
+  adrp x5, _foo@PAGE
+L10:
+  add  x6, x5, _foo@PAGEOFF
+L11:
+  adrp x5, _bar@PAGE
+L12:
+  add  x6, x5, _bar@PAGEOFF
 
 .data
 .align 12
@@ -54,3 +67,6 @@ _baz:
 .loh AdrpAdrp L3, L4
 .loh AdrpAdrp L5, L6
 .loh AdrpAdrp L7, L8
+.loh AdrpAdrp L9, L11
+.loh AdrpAdd  L9, L10
+.loh AdrpAdd  L11, L12