[lld] [lld][InstrProf] Sort startup functions for compression (PR #107348)

Thu Sep 5 10:08:40 PDT 2024

https://github.com/ellishg updated https://github.com/llvm/llvm-project/pull/107348

>From cca346ff49665f8c751ad1f42a189d017074fc3c Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Wed, 4 Sep 2024 19:42:00 -0700
Subject: [PATCH 1/3] [InstrProf][LLD] Refactor

* Rename `constructNodesForCompression()` -> `getUnsForCompression()`
* Pass `duplicateSectionIdxs` as a pointer to make it possible to skip
* Combine `duplicate{Function,Data}SectionIdxs` into one variable
* Compute all `BPFunctionNode` vectors at the end (like `nodesForStartup`)
---
 lld/MachO/BPSectionOrderer.cpp | 118 +++++++++++++++++----------------
 1 file changed, 62 insertions(+), 56 deletions(-)

diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
index 568843d72bbb50..3ba844f44fb791 100644
--- a/lld/MachO/BPSectionOrderer.cpp
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -60,12 +60,13 @@ getRelocHash(const Reloc &reloc,
   return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
 }
 
-static void constructNodesForCompression(
-    const SmallVector<const InputSection *> &sections,
+static SmallVector<
+    std::pair<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>>
+getUnsForCompression(
+    ArrayRef<const InputSection *> sections,
     const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
-    const SmallVector<unsigned> &sectionIdxs,
-    std::vector<BPFunctionNode> &nodes,
-    DenseMap<unsigned, SmallVector<unsigned>> &duplicateSectionIdxs,
+    ArrayRef<unsigned> sectionIdxs,
+    DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
     BPFunctionNode::UtilityNodeT &maxUN) {
   TimeTraceScope timeScope("Build nodes for compression");
 
@@ -103,49 +104,53 @@ static void constructNodesForCompression(
     for (auto hash : hashes)
       ++hashFrequency[hash];
 
-  // Merge section that are nearly identical
-  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
-  DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
-  for (auto &[sectionIdx, hashes] : sectionHashes) {
-    uint64_t wholeHash = 0;
-    for (auto hash : hashes)
-      if (hashFrequency[hash] > 5)
-        wholeHash ^= hash;
-    auto [it, wasInserted] =
-        wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
-    if (wasInserted) {
-      newSectionHashes.emplace_back(sectionIdx, hashes);
-    } else {
-      duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx);
+  if (duplicateSectionIdxs) {
+    // Merge section that are nearly identical
+    SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
+    DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
+    for (auto &[sectionIdx, hashes] : sectionHashes) {
+      uint64_t wholeHash = 0;
+      for (auto hash : hashes)
+        if (hashFrequency[hash] > 5)
+          wholeHash ^= hash;
+      auto [it, wasInserted] =
+          wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
+      if (wasInserted) {
+        newSectionHashes.emplace_back(sectionIdx, hashes);
+      } else {
+        (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
+      }
     }
-  }
-  sectionHashes = newSectionHashes;
+    sectionHashes = newSectionHashes;
 
-  // Recompute hash frequencies
-  hashFrequency.clear();
-  for (auto &[sectionIdx, hashes] : sectionHashes)
-    for (auto hash : hashes)
-      ++hashFrequency[hash];
+    // Recompute hash frequencies
+    hashFrequency.clear();
+    for (auto &[sectionIdx, hashes] : sectionHashes)
+      for (auto hash : hashes)
+        ++hashFrequency[hash];
+  }
 
   // Filter rare and common hashes and assign each a unique utility node that
   // doesn't conflict with the trace utility nodes
   DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
   for (auto &[hash, frequency] : hashFrequency) {
-    if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size())
+    if (frequency <= 1 || frequency * 2 > sectionHashes.size())
       continue;
     hashToUN[hash] = ++maxUN;
   }
 
-  std::vector<BPFunctionNode::UtilityNodeT> uns;
+  SmallVector<std::pair<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>>
+      sectionUns;
   for (auto &[sectionIdx, hashes] : sectionHashes) {
+    SmallVector<BPFunctionNode::UtilityNodeT> uns;
     for (auto &hash : hashes) {
       auto it = hashToUN.find(hash);
       if (it != hashToUN.end())
         uns.push_back(it->second);
     }
-    nodes.emplace_back(sectionIdx, uns);
-    uns.clear();
+    sectionUns.emplace_back(sectionIdx, uns);
   }
+  return sectionUns;
 }
 
 DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
@@ -185,10 +190,11 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
                                            sectionIdxs.end());
   }
 
-  std::vector<BPFunctionNode> nodesForStartup;
   BPFunctionNode::UtilityNodeT maxUN = 0;
   DenseMap<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>
       startupSectionIdxUNs;
+  // Used to define the initial order for startup functions.
+  DenseMap<unsigned, size_t> sectionIdxToTimestamp;
   std::unique_ptr<InstrProfReader> reader;
   if (!profilePath.empty()) {
     auto fs = vfs::getRealFileSystem();
@@ -202,8 +208,6 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
     }
     auto &traces = reader->getTemporalProfTraces();
 
-    // Used to define the initial order for startup functions.
-    DenseMap<unsigned, size_t> sectionIdxToTimestamp;
     DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
     for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
       uint64_t currentSize = 0, cutoffSize = 1;
@@ -245,15 +249,6 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
       ++maxUN;
       sectionIdxToFirstUN.clear();
     }
-
-    // These uns should already be sorted without duplicates.
-    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
-      nodesForStartup.emplace_back(sectionIdx, uns);
-
-    llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
-      return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
-             std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
-    });
   }
 
   SmallVector<unsigned> sectionIdxsForFunctionCompression,
@@ -271,21 +266,32 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
     }
   }
 
-  std::vector<BPFunctionNode> nodesForFunctionCompression,
-      nodesForDataCompression;
   // Map a section index (to be ordered for compression) to a list of duplicate
   // section indices (not ordered for compression).
-  DenseMap<unsigned, SmallVector<unsigned>> duplicateFunctionSectionIdxs,
-      duplicateDataSectionIdxs;
-  constructNodesForCompression(
+  DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
+  auto unsForFunctionCompression = getUnsForCompression(
       sections, sectionToIdx, sectionIdxsForFunctionCompression,
-      nodesForFunctionCompression, duplicateFunctionSectionIdxs, maxUN);
-  constructNodesForCompression(
+      &duplicateSectionIdxs, maxUN);
+  auto unsForDataCompression = getUnsForCompression(
       sections, sectionToIdx, sectionIdxsForDataCompression,
-      nodesForDataCompression, duplicateDataSectionIdxs, maxUN);
+      &duplicateSectionIdxs, maxUN);
 
-  // Sort nodes by their Id (which is the section index) because the input
-  // linker order tends to be not bad
+  std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
+      nodesForDataCompression;
+  for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+    nodesForStartup.emplace_back(sectionIdx, uns);
+  for (auto &[sectionIdx, uns] : unsForFunctionCompression)
+    nodesForFunctionCompression.emplace_back(sectionIdx, uns);
+  for (auto &[sectionIdx, uns] : unsForDataCompression)
+    nodesForDataCompression.emplace_back(sectionIdx, uns);
+
+  // Use the first timestamp to define the initial order for startup nodes.
+  llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
+    return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
+           std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
+  });
+  // Sort compression nodes by their Id (which is the section index) because the
+  // input linker order tends to be not bad.
   llvm::sort(nodesForFunctionCompression,
              [](auto &L, auto &R) { return L.Id < R.Id; });
   llvm::sort(nodesForDataCompression,
@@ -318,8 +324,8 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
     if (orderedSections.insert(isec))
       ++numCodeCompressionSections;
 
-    auto It = duplicateFunctionSectionIdxs.find(node.Id);
-    if (It == duplicateFunctionSectionIdxs.end())
+    auto It = duplicateSectionIdxs.find(node.Id);
+    if (It == duplicateSectionIdxs.end())
       continue;
     for (auto dupSecIdx : It->getSecond()) {
       const auto *dupIsec = sections[dupSecIdx];
@@ -332,8 +338,8 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
     const auto *isec = sections[node.Id];
     if (orderedSections.insert(isec))
       ++numDataCompressionSections;
-    auto It = duplicateDataSectionIdxs.find(node.Id);
-    if (It == duplicateDataSectionIdxs.end())
+    auto It = duplicateSectionIdxs.find(node.Id);
+    if (It == duplicateSectionIdxs.end())
       continue;
     for (auto dupSecIdx : It->getSecond()) {
       const auto *dupIsec = sections[dupSecIdx];

>From 29604b3f150e1017732eff68a0e05298c49797e9 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Wed, 4 Sep 2024 20:31:55 -0700
Subject: [PATCH 2/3] [InstrProf][LLD] Sort startup functions for compression

|                       | Page Fault Area | Compressed Size |
| --------------------- | --------------- | --------------- |
| Base                  | 1.045659e+09    | 25098246        |
| Startup Compression   | 1.430330e+09    | 24740192        |
| Remove Startup Hashes | 2.272126e+09    | 24574929        |
---
 lld/MachO/BPSectionOrderer.cpp             | 20 +++++++++++++++++---
 lld/MachO/BPSectionOrderer.h               |  2 +-
 lld/MachO/Config.h                         |  1 +
 lld/MachO/Driver.cpp                       |  3 +++
 lld/MachO/Options.td                       |  2 ++
 lld/MachO/SectionPriorities.cpp            |  1 +
 lld/test/MachO/bp-section-orderer-stress.s |  4 ++--
 7 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
index 3ba844f44fb791..71758070ddc8ae 100644
--- a/lld/MachO/BPSectionOrderer.cpp
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -155,7 +155,8 @@ getUnsForCompression(
 
 DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
     size_t &highestAvailablePriority, StringRef profilePath,
-    bool forFunctionCompression, bool forDataCompression, bool verbose) {
+    bool forFunctionCompression, bool forDataCompression,
+    bool compressionSortStartupFunctions, bool verbose) {
 
   SmallVector<const InputSection *> sections;
   DenseMap<const InputSection *, uint64_t> sectionToIdx;
@@ -266,8 +267,15 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
     }
   }
 
-  // Map a section index (to be ordered for compression) to a list of duplicate
-  // section indices (not ordered for compression).
+  SmallVector<unsigned> startupIdxs;
+  if (compressionSortStartupFunctions)
+    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+      startupIdxs.push_back(sectionIdx);
+  auto unsForStartupFunctionCompression =
+      getUnsForCompression(sections, sectionToIdx, startupIdxs, nullptr, maxUN);
+
+  // Map a section index (order directly) to a list of duplicate section indices
+  // (not ordered directly).
   DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
   auto unsForFunctionCompression = getUnsForCompression(
       sections, sectionToIdx, sectionIdxsForFunctionCompression,
@@ -276,6 +284,12 @@ DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
       sections, sectionToIdx, sectionIdxsForDataCompression,
       &duplicateSectionIdxs, maxUN);
 
+  for (auto &[sectionIdx, compressionUns] : unsForStartupFunctionCompression) {
+    auto &uns = startupSectionIdxUNs[sectionIdx];
+    uns.append(compressionUns);
+    llvm::sort(uns);
+    uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
+  }
   std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
       nodesForDataCompression;
   for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h
index 6f9eefd5d82beb..cefd0ceb10e561 100644
--- a/lld/MachO/BPSectionOrderer.h
+++ b/lld/MachO/BPSectionOrderer.h
@@ -30,7 +30,7 @@ llvm::DenseMap<const lld::macho::InputSection *, size_t>
 runBalancedPartitioning(size_t &highestAvailablePriority,
                         llvm::StringRef profilePath,
                         bool forFunctionCompression, bool forDataCompression,
-                        bool verbose);
+                        bool compressionSortStartupFunctions, bool verbose);
 
 } // namespace lld::macho
 
diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 5beb0662ba7274..17259ee75aa4db 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -218,6 +218,7 @@ struct Configuration {
   llvm::StringRef printSymbolOrder;
 
   llvm::StringRef irpgoProfileSortProfilePath;
+  bool compressionSortStartupFunctions = false;
   bool functionOrderForCompression = false;
   bool dataOrderForCompression = false;
   bool verboseBpSectionOrderer = false;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 6a1ff96ed65697..514ee1933f2d1a 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1772,6 +1772,9 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     config->irpgoProfileSortProfilePath = arg->getValue();
     IncompatWithCGSort(arg->getSpelling());
   }
+  config->compressionSortStartupFunctions =
+      args.hasFlag(OPT_compression_sort_startup_functions,
+                   OPT_no_compression_sort_startup_functions, false);
   if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
     StringRef compressionSortStr = arg->getValue();
     if (compressionSortStr == "function") {
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 9c9570cdbeb05c..4ac1e2080de5cc 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -131,6 +131,8 @@ def irpgo_profile_sort_eq: Joined<["--"], "irpgo-profile-sort=">,
     Alias<!cast<Separate>(irpgo_profile_sort)>, MetaVarName<"<profile>">,
     HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
     Group<grp_lld>;
+def compression_sort_startup_functions: Flag<["--"], "compression-sort-startup-functions">, HelpText<"TODO">, Group<grp_lld>;
+def no_compression_sort_startup_functions: Flag<["--"], "no-compression-sort-startup-functions">, HelpText<"TODO">, Group<grp_lld>;
 def compression_sort: Joined<["--"], "compression-sort=">,
     MetaVarName<"[none,function,data,both]">,
     HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp
index 69c301d8ff8a71..1e7fb5973b8086 100644
--- a/lld/MachO/SectionPriorities.cpp
+++ b/lld/MachO/SectionPriorities.cpp
@@ -359,6 +359,7 @@ macho::PriorityBuilder::buildInputSectionPriorities() {
     sectionPriorities = runBalancedPartitioning(
         highestAvailablePriority, config->irpgoProfileSortProfilePath,
         config->functionOrderForCompression, config->dataOrderForCompression,
+        config->compressionSortStartupFunctions,
         config->verboseBpSectionOrderer);
   } else if (config->callGraphProfileSort) {
     // Sort sections by the profile data provided by __LLVM,__cg_profile
diff --git a/lld/test/MachO/bp-section-orderer-stress.s b/lld/test/MachO/bp-section-orderer-stress.s
index fdc6a20e2655b9..986e2d8fd1069b 100644
--- a/lld/test/MachO/bp-section-orderer-stress.s
+++ b/lld/test/MachO/bp-section-orderer-stress.s
@@ -7,8 +7,8 @@
 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t.s -o %t.o
 # RUN: llvm-profdata merge %t.proftext -o %t.profdata
 
-# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
-# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
+# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort-startup-functions --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
+# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort-startup-functions --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
 # RUN: diff %t.order1.txt %t.order2.txt
 
 import random

>From a9d9531fb72d695cc618cd9be63daee1f3e56607 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag at gmail.com>
Date: Thu, 5 Sep 2024 10:08:21 -0700
Subject: [PATCH 3/3] Add help message

---
 lld/MachO/Driver.cpp                     | 4 ++++
 lld/MachO/Options.td                     | 7 +++++--
 lld/test/MachO/bp-section-orderer-errs.s | 3 +++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 514ee1933f2d1a..12dcceecb56e14 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1775,6 +1775,10 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
   config->compressionSortStartupFunctions =
       args.hasFlag(OPT_compression_sort_startup_functions,
                    OPT_no_compression_sort_startup_functions, false);
+  if (config->irpgoProfileSortProfilePath.empty() &&
+      config->compressionSortStartupFunctions)
+    error("--compression-sort-startup-functions must be used with "
+          "--irpgo-profile-sort");
   if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
     StringRef compressionSortStr = arg->getValue();
     if (compressionSortStr == "function") {
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 4ac1e2080de5cc..f264c040c91794 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -131,8 +131,11 @@ def irpgo_profile_sort_eq: Joined<["--"], "irpgo-profile-sort=">,
     Alias<!cast<Separate>(irpgo_profile_sort)>, MetaVarName<"<profile>">,
     HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
     Group<grp_lld>;
-def compression_sort_startup_functions: Flag<["--"], "compression-sort-startup-functions">, HelpText<"TODO">, Group<grp_lld>;
-def no_compression_sort_startup_functions: Flag<["--"], "no-compression-sort-startup-functions">, HelpText<"TODO">, Group<grp_lld>;
+def compression_sort_startup_functions: Flag<["--"], "compression-sort-startup-functions">,
+    HelpText<"Order startup functions to improve compressed size in addition to startup time">,
+    Group<grp_lld>;
+def no_compression_sort_startup_functions: Flag<["--"], "no-compression-sort-startup-functions">, 
+    HelpText<"Do not order startup function for compression">, Group<grp_lld>;
 def compression_sort: Joined<["--"], "compression-sort=">,
     MetaVarName<"[none,function,data,both]">,
     HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
diff --git a/lld/test/MachO/bp-section-orderer-errs.s b/lld/test/MachO/bp-section-orderer-errs.s
index 8392b88508481f..682eb0c08bf1f9 100644
--- a/lld/test/MachO/bp-section-orderer-errs.s
+++ b/lld/test/MachO/bp-section-orderer-errs.s
@@ -7,3 +7,6 @@
 
 # RUN: not %lld -o /dev/null --compression-sort=malformed 2>&1 | FileCheck %s --check-prefix=COMPRESSION-MALFORM
 # COMPRESSION-MALFORM: unknown value `malformed` for --compression-sort=
+
+# RUN: not %lld -o /dev/null --compression-sort-startup-functions 2>&1 | FileCheck %s --check-prefix=STARTUP
+# STARTUP: --compression-sort-startup-functions must be used with --irpgo-profile-sort