[lld] [lld][InstrProf] Sort startup functions for compression (PR #107348)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 4 21:06:14 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-lld
@llvm/pr-subscribers-lld-macho
Author: Ellis Hoag (ellishg)
Changes:
Before this, startup functions were ordered only to reduce the number of page faults. This patch introduces `--compression-sort-startup-functions`, which orders these functions to also improve compressed size at the same time. Of course, this trades away some startup performance in favor of improved compressed size.
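The key change (see the diff below) is that each startup function's utility nodes from the temporal profile are merged with the utility nodes derived from its content hashes, so a single balanced-partitioning run can optimize both objectives. Below is a minimal sketch of that merge step only, using simplified standard-library containers in place of lld's `BPFunctionNode`/`UtilityNodeT` machinery; it is an illustration, not the actual implementation.

```cpp
// Minimal sketch of the merge step, not the actual lld implementation:
// hypothetical simplified types stand in for BPFunctionNode/UtilityNodeT.
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

using UtilityNode = uint32_t;
using SectionUNs = std::map<unsigned, std::vector<UtilityNode>>;

// Append each startup function's compression-derived utility nodes to its
// startup (temporal-profile) utility nodes and deduplicate, so one balanced
// partitioning pass sees both the page-fault and the compression signals.
void mergeCompressionUNsIntoStartupUNs(SectionUNs &startupUNs,
                                       const SectionUNs &compressionUNs) {
  for (const auto &[sectionIdx, uns] : compressionUNs) {
    auto &merged = startupUNs[sectionIdx];
    merged.insert(merged.end(), uns.begin(), uns.end());
    std::sort(merged.begin(), merged.end());
    merged.erase(std::unique(merged.begin(), merged.end()), merged.end());
  }
}
```

In the diff this corresponds to appending `unsForStartupFunctionCompression` into `startupSectionIdxUNs` and deduplicating before the startup nodes are built and sorted by timestamp.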
I built a large binary without this flag (Base) and with this flag (Startup Compression). I also removed all startup hashes (Remove Startup Hashes) to see how much ordering functions for startup affects the number of page faults. I recorded "Page Fault Curve Area" from the verbose logs, and measured "Compressed Size" by compressing the resulting binary with `zip`.
| Configuration | Page Fault Curve Area | Compressed Size |
| --------------------- | --------------------- | --------------- |
| Base | 1.045659e+09 | 25.1 MB |
| Startup Compression | 1.430330e+09 (+36.8%) | 24.7 MB (-1.4%) |
| Remove Startup Hashes | 2.272126e+09 (+117%) | 24.6 MB (-2.1%) |
Depends on https://github.com/llvm/llvm-project/pull/107347
---
Full diff: https://github.com/llvm/llvm-project/pull/107348.diff
7 Files Affected:
- (modified) lld/MachO/BPSectionOrderer.cpp (+79-59)
- (modified) lld/MachO/BPSectionOrderer.h (+1-1)
- (modified) lld/MachO/Config.h (+1)
- (modified) lld/MachO/Driver.cpp (+3)
- (modified) lld/MachO/Options.td (+2)
- (modified) lld/MachO/SectionPriorities.cpp (+1)
- (modified) lld/test/MachO/bp-section-orderer-stress.s (+2-2)
``````````diff
diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
index 568843d72bbb50..71758070ddc8ae 100644
--- a/lld/MachO/BPSectionOrderer.cpp
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -60,12 +60,13 @@ getRelocHash(const Reloc &reloc,
return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
}
-static void constructNodesForCompression(
- const SmallVector<const InputSection *> &sections,
+static SmallVector<
+ std::pair<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>>
+getUnsForCompression(
+ ArrayRef<const InputSection *> sections,
const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
- const SmallVector<unsigned> &sectionIdxs,
- std::vector<BPFunctionNode> &nodes,
- DenseMap<unsigned, SmallVector<unsigned>> &duplicateSectionIdxs,
+ ArrayRef<unsigned> sectionIdxs,
+ DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
BPFunctionNode::UtilityNodeT &maxUN) {
TimeTraceScope timeScope("Build nodes for compression");
@@ -103,54 +104,59 @@ static void constructNodesForCompression(
for (auto hash : hashes)
++hashFrequency[hash];
- // Merge section that are nearly identical
- SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
- DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
- for (auto &[sectionIdx, hashes] : sectionHashes) {
- uint64_t wholeHash = 0;
- for (auto hash : hashes)
- if (hashFrequency[hash] > 5)
- wholeHash ^= hash;
- auto [it, wasInserted] =
- wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
- if (wasInserted) {
- newSectionHashes.emplace_back(sectionIdx, hashes);
- } else {
- duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx);
+ if (duplicateSectionIdxs) {
+ // Merge section that are nearly identical
+ SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
+ DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
+ for (auto &[sectionIdx, hashes] : sectionHashes) {
+ uint64_t wholeHash = 0;
+ for (auto hash : hashes)
+ if (hashFrequency[hash] > 5)
+ wholeHash ^= hash;
+ auto [it, wasInserted] =
+ wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
+ if (wasInserted) {
+ newSectionHashes.emplace_back(sectionIdx, hashes);
+ } else {
+ (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
+ }
}
- }
- sectionHashes = newSectionHashes;
+ sectionHashes = newSectionHashes;
- // Recompute hash frequencies
- hashFrequency.clear();
- for (auto &[sectionIdx, hashes] : sectionHashes)
- for (auto hash : hashes)
- ++hashFrequency[hash];
+ // Recompute hash frequencies
+ hashFrequency.clear();
+ for (auto &[sectionIdx, hashes] : sectionHashes)
+ for (auto hash : hashes)
+ ++hashFrequency[hash];
+ }
// Filter rare and common hashes and assign each a unique utility node that
// doesn't conflict with the trace utility nodes
DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
for (auto &[hash, frequency] : hashFrequency) {
- if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size())
+ if (frequency <= 1 || frequency * 2 > sectionHashes.size())
continue;
hashToUN[hash] = ++maxUN;
}
- std::vector<BPFunctionNode::UtilityNodeT> uns;
+ SmallVector<std::pair<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>>
+ sectionUns;
for (auto &[sectionIdx, hashes] : sectionHashes) {
+ SmallVector<BPFunctionNode::UtilityNodeT> uns;
for (auto &hash : hashes) {
auto it = hashToUN.find(hash);
if (it != hashToUN.end())
uns.push_back(it->second);
}
- nodes.emplace_back(sectionIdx, uns);
- uns.clear();
+ sectionUns.emplace_back(sectionIdx, uns);
}
+ return sectionUns;
}
DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
size_t &highestAvailablePriority, StringRef profilePath,
- bool forFunctionCompression, bool forDataCompression, bool verbose) {
+ bool forFunctionCompression, bool forDataCompression,
+ bool compressionSortStartupFunctions, bool verbose) {
SmallVector<const InputSection *> sections;
DenseMap<const InputSection *, uint64_t> sectionToIdx;
@@ -185,10 +191,11 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
sectionIdxs.end());
}
- std::vector<BPFunctionNode> nodesForStartup;
BPFunctionNode::UtilityNodeT maxUN = 0;
DenseMap<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>
startupSectionIdxUNs;
+ // Used to define the initial order for startup functions.
+ DenseMap<unsigned, size_t> sectionIdxToTimestamp;
std::unique_ptr<InstrProfReader> reader;
if (!profilePath.empty()) {
auto fs = vfs::getRealFileSystem();
@@ -202,8 +209,6 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
}
auto &traces = reader->getTemporalProfTraces();
- // Used to define the initial order for startup functions.
- DenseMap<unsigned, size_t> sectionIdxToTimestamp;
DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
uint64_t currentSize = 0, cutoffSize = 1;
@@ -245,15 +250,6 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
++maxUN;
sectionIdxToFirstUN.clear();
}
-
- // These uns should already be sorted without duplicates.
- for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
- nodesForStartup.emplace_back(sectionIdx, uns);
-
- llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
- return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
- std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
- });
}
SmallVector<unsigned> sectionIdxsForFunctionCompression,
@@ -271,21 +267,45 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
}
}
- std::vector<BPFunctionNode> nodesForFunctionCompression,
- nodesForDataCompression;
- // Map a section index (to be ordered for compression) to a list of duplicate
- // section indices (not ordered for compression).
- DenseMap<unsigned, SmallVector<unsigned>> duplicateFunctionSectionIdxs,
- duplicateDataSectionIdxs;
- constructNodesForCompression(
+ SmallVector<unsigned> startupIdxs;
+ if (compressionSortStartupFunctions)
+ for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+ startupIdxs.push_back(sectionIdx);
+ auto unsForStartupFunctionCompression =
+ getUnsForCompression(sections, sectionToIdx, startupIdxs, nullptr, maxUN);
+
+ // Map a section index (order directly) to a list of duplicate section indices
+ // (not ordered directly).
+ DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
+ auto unsForFunctionCompression = getUnsForCompression(
sections, sectionToIdx, sectionIdxsForFunctionCompression,
- nodesForFunctionCompression, duplicateFunctionSectionIdxs, maxUN);
- constructNodesForCompression(
+ &duplicateSectionIdxs, maxUN);
+ auto unsForDataCompression = getUnsForCompression(
sections, sectionToIdx, sectionIdxsForDataCompression,
- nodesForDataCompression, duplicateDataSectionIdxs, maxUN);
+ &duplicateSectionIdxs, maxUN);
- // Sort nodes by their Id (which is the section index) because the input
- // linker order tends to be not bad
+ for (auto &[sectionIdx, compressionUns] : unsForStartupFunctionCompression) {
+ auto &uns = startupSectionIdxUNs[sectionIdx];
+ uns.append(compressionUns);
+ llvm::sort(uns);
+ uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
+ }
+ std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
+ nodesForDataCompression;
+ for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+ nodesForStartup.emplace_back(sectionIdx, uns);
+ for (auto &[sectionIdx, uns] : unsForFunctionCompression)
+ nodesForFunctionCompression.emplace_back(sectionIdx, uns);
+ for (auto &[sectionIdx, uns] : unsForDataCompression)
+ nodesForDataCompression.emplace_back(sectionIdx, uns);
+
+ // Use the first timestamp to define the initial order for startup nodes.
+ llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
+ return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
+ std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
+ });
+ // Sort compression nodes by their Id (which is the section index) because the
+ // input linker order tends to be not bad.
llvm::sort(nodesForFunctionCompression,
[](auto &L, auto &R) { return L.Id < R.Id; });
llvm::sort(nodesForDataCompression,
@@ -318,8 +338,8 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
if (orderedSections.insert(isec))
++numCodeCompressionSections;
- auto It = duplicateFunctionSectionIdxs.find(node.Id);
- if (It == duplicateFunctionSectionIdxs.end())
+ auto It = duplicateSectionIdxs.find(node.Id);
+ if (It == duplicateSectionIdxs.end())
continue;
for (auto dupSecIdx : It->getSecond()) {
const auto *dupIsec = sections[dupSecIdx];
@@ -332,8 +352,8 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
const auto *isec = sections[node.Id];
if (orderedSections.insert(isec))
++numDataCompressionSections;
- auto It = duplicateDataSectionIdxs.find(node.Id);
- if (It == duplicateDataSectionIdxs.end())
+ auto It = duplicateSectionIdxs.find(node.Id);
+ if (It == duplicateSectionIdxs.end())
continue;
for (auto dupSecIdx : It->getSecond()) {
const auto *dupIsec = sections[dupSecIdx];
diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h
index 6f9eefd5d82beb..cefd0ceb10e561 100644
--- a/lld/MachO/BPSectionOrderer.h
+++ b/lld/MachO/BPSectionOrderer.h
@@ -30,7 +30,7 @@ llvm::DenseMap<const lld::macho::InputSection *, size_t>
runBalancedPartitioning(size_t &highestAvailablePriority,
llvm::StringRef profilePath,
bool forFunctionCompression, bool forDataCompression,
- bool verbose);
+ bool compressionSortStartupFunctions, bool verbose);
} // namespace lld::macho
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 5beb0662ba7274..17259ee75aa4db 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -218,6 +218,7 @@ struct Configuration {
llvm::StringRef printSymbolOrder;
llvm::StringRef irpgoProfileSortProfilePath;
+ bool compressionSortStartupFunctions = false;
bool functionOrderForCompression = false;
bool dataOrderForCompression = false;
bool verboseBpSectionOrderer = false;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 6a1ff96ed65697..514ee1933f2d1a 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1772,6 +1772,9 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
config->irpgoProfileSortProfilePath = arg->getValue();
IncompatWithCGSort(arg->getSpelling());
}
+ config->compressionSortStartupFunctions =
+ args.hasFlag(OPT_compression_sort_startup_functions,
+ OPT_no_compression_sort_startup_functions, false);
if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
StringRef compressionSortStr = arg->getValue();
if (compressionSortStr == "function") {
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 9c9570cdbeb05c..4ac1e2080de5cc 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -131,6 +131,8 @@ def irpgo_profile_sort_eq: Joined<["--"], "irpgo-profile-sort=">,
Alias<!cast<Separate>(irpgo_profile_sort)>, MetaVarName<"<profile>">,
HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
Group<grp_lld>;
+def compression_sort_startup_functions: Flag<["--"], "compression-sort-startup-functions">, HelpText<"TODO">, Group<grp_lld>;
+def no_compression_sort_startup_functions: Flag<["--"], "no-compression-sort-startup-functions">, HelpText<"TODO">, Group<grp_lld>;
def compression_sort: Joined<["--"], "compression-sort=">,
MetaVarName<"[none,function,data,both]">,
HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp
index 69c301d8ff8a71..1e7fb5973b8086 100644
--- a/lld/MachO/SectionPriorities.cpp
+++ b/lld/MachO/SectionPriorities.cpp
@@ -359,6 +359,7 @@ macho::PriorityBuilder::buildInputSectionPriorities() {
sectionPriorities = runBalancedPartitioning(
highestAvailablePriority, config->irpgoProfileSortProfilePath,
config->functionOrderForCompression, config->dataOrderForCompression,
+ config->compressionSortStartupFunctions,
config->verboseBpSectionOrderer);
} else if (config->callGraphProfileSort) {
// Sort sections by the profile data provided by __LLVM,__cg_profile
diff --git a/lld/test/MachO/bp-section-orderer-stress.s b/lld/test/MachO/bp-section-orderer-stress.s
index fdc6a20e2655b9..986e2d8fd1069b 100644
--- a/lld/test/MachO/bp-section-orderer-stress.s
+++ b/lld/test/MachO/bp-section-orderer-stress.s
@@ -7,8 +7,8 @@
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t.s -o %t.o
# RUN: llvm-profdata merge %t.proftext -o %t.profdata
-# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
-# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
+# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort-startup-functions --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
+# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort-startup-functions --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
# RUN: diff %t.order1.txt %t.order2.txt
import random
``````````
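One design point visible in the diff: `getUnsForCompression` now takes an optional `duplicateSectionIdxs` map, and the startup-function call passes `nullptr`, so near-duplicate merging only runs for the plain function/data compression orderings. A self-contained sketch of that merging step (simplified standard-library types, not lld's actual data structures) might look like this:

```cpp
// Sketch of the near-duplicate merging that getUnsForCompression() performs
// when duplicateSectionIdxs is non-null (simplified, hypothetical types).
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

// XOR each section's "frequent" hashes (frequency > 5 across all sections)
// into a whole-section hash; sections sharing a whole hash are treated as
// near-duplicates and folded into the first section seen with that hash.
std::vector<std::pair<unsigned, std::vector<uint64_t>>> mergeNearDuplicates(
    const std::vector<std::pair<unsigned, std::vector<uint64_t>>>
        &sectionHashes,
    const std::map<uint64_t, unsigned> &hashFrequency,
    std::map<unsigned, std::vector<unsigned>> &duplicateSectionIdxs) {
  std::vector<std::pair<unsigned, std::vector<uint64_t>>> newSectionHashes;
  std::map<uint64_t, unsigned> wholeHashToSectionIdx;
  for (const auto &[sectionIdx, hashes] : sectionHashes) {
    uint64_t wholeHash = 0;
    for (uint64_t hash : hashes) {
      auto freqIt = hashFrequency.find(hash);
      if (freqIt != hashFrequency.end() && freqIt->second > 5)
        wholeHash ^= hash;
    }
    auto [it, wasInserted] =
        wholeHashToSectionIdx.insert({wholeHash, sectionIdx});
    if (wasInserted)
      newSectionHashes.emplace_back(sectionIdx, hashes);
    else
      duplicateSectionIdxs[it->second].push_back(sectionIdx);
  }
  return newSectionHashes;
}
```

Skipping this step for startup functions means every startup function keeps its own entry in the ordering; the folded duplicates are only reattached next to their representatives in the compression-ordered sections.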
https://github.com/llvm/llvm-project/pull/107348