[lld] [lld][ELF] Extend profile guided function ordering to ELF binaries (PR #117514)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 24 19:22:28 PST 2024
https://github.com/Colibrow created https://github.com/llvm/llvm-project/pull/117514
Extend balanced partitioning implementation to support ELF binaries, enabling the same startup time and compressed size optimizations previously available for MachO.
This allows ELF binaries to benefit from profile-guided function ordering and compression-based section ordering.
Add the lld flags `--irpgo-profile-sort=<profile>` and `--compression-sort={function,data,both}`.
Thanks to @ellishg, @thevinster, and their team for their work.
>From e5ab745e4aa1e9579550a473c4c1c1da290a65a6 Mon Sep 17 00:00:00 2001
From: xupengying <xpy66swsry at gmail.com>
Date: Mon, 25 Nov 2024 11:10:04 +0800
Subject: [PATCH] [lld][ELF] Extend profile guided function ordering to ELF
binaries
Extend balanced partitioning implementation to support ELF binaries, enabling
the same startup time and compressed size optimizations previously available for
MachO.
This allows ELF binaries to benefit from profile-guided function ordering
and compression-based section ordering.
Add the lld flags `--irpgo-profile-sort=<profile>` and
`--compression-sort={function,data,both}`.
Thanks to ellishg, thevinster, and their team for their work.
---
lld/Common/CMakeLists.txt | 1 +
lld/Common/SectionOrderer.cpp | 383 +++++++++++++++++++++
lld/ELF/BPSectionOrderer.cpp | 50 +++
lld/ELF/BPSectionOrderer.h | 140 ++++++++
lld/ELF/CMakeLists.txt | 1 +
lld/ELF/Config.h | 5 +
lld/ELF/Driver.cpp | 21 ++
lld/ELF/Options.td | 14 +
lld/ELF/Writer.cpp | 13 +-
lld/MachO/BPSectionOrderer.cpp | 410 +----------------------
lld/MachO/BPSectionOrderer.h | 136 ++++++++
lld/include/lld/Common/SectionOrderer.h | 75 +++++
lld/test/ELF/bp-section-orderer-errs.s | 19 ++
lld/test/ELF/bp-section-orderer-stress.s | 105 ++++++
lld/test/ELF/bp-section-orderer.s | 123 +++++++
15 files changed, 1098 insertions(+), 398 deletions(-)
create mode 100644 lld/Common/SectionOrderer.cpp
create mode 100644 lld/ELF/BPSectionOrderer.cpp
create mode 100644 lld/ELF/BPSectionOrderer.h
create mode 100644 lld/include/lld/Common/SectionOrderer.h
create mode 100644 lld/test/ELF/bp-section-orderer-errs.s
create mode 100644 lld/test/ELF/bp-section-orderer-stress.s
create mode 100644 lld/test/ELF/bp-section-orderer.s
diff --git a/lld/Common/CMakeLists.txt b/lld/Common/CMakeLists.txt
index 4f503d04f7844f..bd5a40af41c1bc 100644
--- a/lld/Common/CMakeLists.txt
+++ b/lld/Common/CMakeLists.txt
@@ -31,6 +31,7 @@ add_lld_library(lldCommon
Filesystem.cpp
Memory.cpp
Reproduce.cpp
+ SectionOrderer.cpp
Strings.cpp
TargetOptionsCommandFlags.cpp
Timer.cpp
diff --git a/lld/Common/SectionOrderer.cpp b/lld/Common/SectionOrderer.cpp
new file mode 100644
index 00000000000000..64c78030f3427f
--- /dev/null
+++ b/lld/Common/SectionOrderer.cpp
@@ -0,0 +1,383 @@
+//===- SectionOrderer.cpp---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lld/Common/SectionOrderer.h"
+#include "lld/Common/ErrorHandler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/xxhash.h"
+
+#define DEBUG_TYPE "bp-section-orderer"
+using namespace llvm;
+using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;
+
+namespace lld {
+
+static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression(
+ ArrayRef<const BPSectionBase *> sections,
+ const DenseMap<const BPSectionBase *, uint64_t> §ionToIdx,
+ ArrayRef<unsigned> sectionIdxs,
+ DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
+ BPFunctionNode::UtilityNodeT &maxUN) {
+ TimeTraceScope timeScope("Build nodes for compression");
+
+ SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
+ sectionHashes.reserve(sectionIdxs.size());
+ SmallVector<uint64_t> hashes;
+
+ for (unsigned sectionIdx : sectionIdxs) {
+ const auto *isec = sections[sectionIdx];
+ isec->getSectionHash(hashes, sectionToIdx);
+ sectionHashes.emplace_back(sectionIdx, std::move(hashes));
+ hashes.clear();
+ }
+
+ DenseMap<uint64_t, unsigned> hashFrequency;
+ for (auto &[sectionIdx, hashes] : sectionHashes)
+ for (auto hash : hashes)
+ ++hashFrequency[hash];
+
+ if (duplicateSectionIdxs) {
+    // Merge sections that are nearly identical
+ SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
+ DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
+ for (auto &[sectionIdx, hashes] : sectionHashes) {
+ uint64_t wholeHash = 0;
+ for (auto hash : hashes)
+ if (hashFrequency[hash] > 5)
+ wholeHash ^= hash;
+ auto [it, wasInserted] =
+ wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
+ if (wasInserted) {
+ newSectionHashes.emplace_back(sectionIdx, hashes);
+ } else {
+ (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
+ }
+ }
+    sectionHashes = std::move(newSectionHashes);
+
+ // Recompute hash frequencies
+ hashFrequency.clear();
+ for (auto &[sectionIdx, hashes] : sectionHashes)
+ for (auto hash : hashes)
+ ++hashFrequency[hash];
+ }
+
+ // Filter rare and common hashes and assign each a unique utility node that
+ // doesn't conflict with the trace utility nodes
+ DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
+ for (auto &[hash, frequency] : hashFrequency) {
+ if (frequency <= 1 || frequency * 2 > sectionHashes.size())
+ continue;
+ hashToUN[hash] = ++maxUN;
+ }
+
+ SmallVector<std::pair<unsigned, UtilityNodes>> sectionUns;
+ for (auto &[sectionIdx, hashes] : sectionHashes) {
+ UtilityNodes uns;
+ for (auto &hash : hashes) {
+ auto it = hashToUN.find(hash);
+ if (it != hashToUN.end())
+ uns.push_back(it->second);
+ }
+    sectionUns.emplace_back(sectionIdx, std::move(uns));
+ }
+ return sectionUns;
+}
+
+llvm::DenseMap<const BPSectionBase *, size_t>
+SectionOrderer::reorderSectionsByBalancedPartitioning(
+ size_t &highestAvailablePriority, llvm::StringRef profilePath,
+ bool forFunctionCompression, bool forDataCompression,
+ bool compressionSortStartupFunctions, bool verbose,
+ SmallVector<BPSectionBase *> inputSections) {
+ TimeTraceScope timeScope("Balanced Partitioning");
+ SmallVector<const BPSectionBase *> sections;
+ DenseMap<const BPSectionBase *, uint64_t> sectionToIdx;
+ StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
+
+ // Process input sections
+ for (const auto *isec : inputSections) {
+ if (!isec->hasValidData())
+ continue;
+
+ unsigned sectionIdx = sections.size();
+ sectionToIdx.try_emplace(isec, sectionIdx);
+ sections.push_back(isec);
+
+ for (auto *sym : isec->getSymbols()) {
+ if (auto *d = sym->asDefinedSymbol())
+ symbolToSectionIdxs[d->getName()].insert(sectionIdx);
+ }
+ }
+ StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
+ for (auto &entry : symbolToSectionIdxs) {
+ StringRef name = entry.getKey();
+ auto §ionIdxs = entry.getValue();
+ name = BPSectionBase::getRootSymbol(name);
+ rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+ sectionIdxs.end());
+ // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
+ // Mangler::getNameWithPrefix() for details.
+ if (name.consume_front("_") || name.consume_front("l_"))
+ rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+ sectionIdxs.end());
+ }
+
+ BPFunctionNode::UtilityNodeT maxUN = 0;
+ DenseMap<unsigned, UtilityNodes> startupSectionIdxUNs;
+ // Used to define the initial order for startup functions.
+ DenseMap<unsigned, size_t> sectionIdxToTimestamp;
+ std::unique_ptr<InstrProfReader> reader;
+ if (!profilePath.empty()) {
+ auto fs = vfs::getRealFileSystem();
+ auto readerOrErr = InstrProfReader::create(profilePath, *fs);
+ lld::checkError(readerOrErr.takeError());
+
+ reader = std::move(readerOrErr.get());
+ for (auto &entry : *reader) {
+ // Read all entries
+ (void)entry;
+ }
+ auto &traces = reader->getTemporalProfTraces();
+
+ DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
+ for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
+ uint64_t currentSize = 0, cutoffSize = 1;
+ size_t cutoffTimestamp = 1;
+ auto &trace = traces[traceIdx].FunctionNameRefs;
+ for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
+ auto [Filename, ParsedFuncName] = getParsedIRPGOName(
+ reader->getSymtab().getFuncOrVarName(trace[timestamp]));
+ ParsedFuncName = BPSectionBase::getRootSymbol(ParsedFuncName);
+
+ auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
+ if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
+ continue;
+ auto §ionIdxs = sectionIdxsIt->getValue();
+ // If the same symbol is found in multiple sections, they might be
+ // identical, so we arbitrarily use the size from the first section.
+ currentSize += sections[*sectionIdxs.begin()]->getSize();
+
+ // Since BalancedPartitioning is sensitive to the initial order, we need
+ // to explicitly define it to be ordered by earliest timestamp.
+ for (unsigned sectionIdx : sectionIdxs) {
+ auto [it, wasInserted] =
+ sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
+ if (!wasInserted)
+ it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
+ }
+
+ if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
+ ++maxUN;
+ cutoffSize = 2 * currentSize;
+ cutoffTimestamp = 2 * cutoffTimestamp;
+ }
+ for (unsigned sectionIdx : sectionIdxs)
+ sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
+ }
+ for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
+ for (auto un = firstUN; un <= maxUN; ++un)
+ startupSectionIdxUNs[sectionIdx].push_back(un);
+ ++maxUN;
+ sectionIdxToFirstUN.clear();
+ }
+ }
+
+ SmallVector<unsigned> sectionIdxsForFunctionCompression,
+ sectionIdxsForDataCompression;
+ for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
+ if (startupSectionIdxUNs.count(sectionIdx))
+ continue;
+ const auto *isec = sections[sectionIdx];
+ if (isec->isCodeSection()) {
+ if (forFunctionCompression)
+ sectionIdxsForFunctionCompression.push_back(sectionIdx);
+ } else {
+ if (forDataCompression)
+ sectionIdxsForDataCompression.push_back(sectionIdx);
+ }
+ }
+
+ if (compressionSortStartupFunctions) {
+ SmallVector<unsigned> startupIdxs;
+ for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+ startupIdxs.push_back(sectionIdx);
+ auto unsForStartupFunctionCompression =
+ getUnsForCompression(sections, sectionToIdx, startupIdxs,
+ /*duplicateSectionIdxs=*/nullptr, maxUN);
+ for (auto &[sectionIdx, compressionUns] :
+ unsForStartupFunctionCompression) {
+ auto &uns = startupSectionIdxUNs[sectionIdx];
+ uns.append(compressionUns);
+ llvm::sort(uns);
+ uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
+ }
+ }
+
+ // Map a section index (order directly) to a list of duplicate section indices
+ // (not ordered directly).
+ DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
+ auto unsForFunctionCompression = getUnsForCompression(
+ sections, sectionToIdx, sectionIdxsForFunctionCompression,
+ &duplicateSectionIdxs, maxUN);
+ auto unsForDataCompression = getUnsForCompression(
+ sections, sectionToIdx, sectionIdxsForDataCompression,
+ &duplicateSectionIdxs, maxUN);
+
+ std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
+ nodesForDataCompression;
+ for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+ nodesForStartup.emplace_back(sectionIdx, uns);
+ for (auto &[sectionIdx, uns] : unsForFunctionCompression)
+ nodesForFunctionCompression.emplace_back(sectionIdx, uns);
+ for (auto &[sectionIdx, uns] : unsForDataCompression)
+ nodesForDataCompression.emplace_back(sectionIdx, uns);
+
+ // Use the first timestamp to define the initial order for startup nodes.
+ llvm::sort(nodesForStartup, [§ionIdxToTimestamp](auto &L, auto &R) {
+ return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
+ std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
+ });
+ // Sort compression nodes by their Id (which is the section index) because the
+ // input linker order tends to be not bad.
+ llvm::sort(nodesForFunctionCompression,
+ [](auto &L, auto &R) { return L.Id < R.Id; });
+ llvm::sort(nodesForDataCompression,
+ [](auto &L, auto &R) { return L.Id < R.Id; });
+
+ {
+ TimeTraceScope timeScope("Balanced Partitioning");
+ BalancedPartitioningConfig config;
+ BalancedPartitioning bp(config);
+ bp.run(nodesForStartup);
+ bp.run(nodesForFunctionCompression);
+ bp.run(nodesForDataCompression);
+ }
+
+ unsigned numStartupSections = 0;
+ unsigned numCodeCompressionSections = 0;
+ unsigned numDuplicateCodeSections = 0;
+ unsigned numDataCompressionSections = 0;
+ unsigned numDuplicateDataSections = 0;
+ SetVector<const BPSectionBase *> orderedSections;
+ // Order startup functions,
+ for (auto &node : nodesForStartup) {
+ const auto *isec = sections[node.Id];
+ if (orderedSections.insert(isec))
+ ++numStartupSections;
+ }
+ // then functions for compression,
+ for (auto &node : nodesForFunctionCompression) {
+ const auto *isec = sections[node.Id];
+ if (orderedSections.insert(isec))
+ ++numCodeCompressionSections;
+
+ auto It = duplicateSectionIdxs.find(node.Id);
+ if (It == duplicateSectionIdxs.end())
+ continue;
+ for (auto dupSecIdx : It->getSecond()) {
+ const auto *dupIsec = sections[dupSecIdx];
+ if (orderedSections.insert(dupIsec))
+ ++numDuplicateCodeSections;
+ }
+ }
+ // then data for compression.
+ for (auto &node : nodesForDataCompression) {
+ const auto *isec = sections[node.Id];
+ if (orderedSections.insert(isec))
+ ++numDataCompressionSections;
+ auto It = duplicateSectionIdxs.find(node.Id);
+ if (It == duplicateSectionIdxs.end())
+ continue;
+ for (auto dupSecIdx : It->getSecond()) {
+ const auto *dupIsec = sections[dupSecIdx];
+ if (orderedSections.insert(dupIsec))
+ ++numDuplicateDataSections;
+ }
+ }
+
+ if (verbose) {
+ unsigned numTotalOrderedSections =
+ numStartupSections + numCodeCompressionSections +
+ numDuplicateCodeSections + numDataCompressionSections +
+ numDuplicateDataSections;
+ dbgs()
+ << "Ordered " << numTotalOrderedSections
+ << " sections using balanced partitioning:\n Functions for startup: "
+ << numStartupSections
+ << "\n Functions for compression: " << numCodeCompressionSections
+ << "\n Duplicate functions: " << numDuplicateCodeSections
+ << "\n Data for compression: " << numDataCompressionSections
+ << "\n Duplicate data: " << numDuplicateDataSections << "\n";
+
+ if (!profilePath.empty()) {
+ // Evaluate this function order for startup
+ StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
+ const uint64_t pageSize = (1 << 14);
+ uint64_t currentAddress = 0;
+ for (const auto *isec : orderedSections) {
+ for (auto *sym : isec->getSymbols()) {
+ if (auto *d = sym->asDefinedSymbol()) {
+ uint64_t startAddress = currentAddress + d->getValue();
+ uint64_t endAddress = startAddress + d->getSize();
+ uint64_t firstPage = startAddress / pageSize;
+          // I think the kernel might pull in a few pages when one is
+          // touched, so it might be more accurate to force lastPage to be
+          // aligned by 4?
+ uint64_t lastPage = endAddress / pageSize;
+ StringRef rootSymbol = d->getName();
+ rootSymbol = BPSectionBase::getRootSymbol(rootSymbol);
+ symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
+ if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
+ symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
+ }
+ }
+ currentAddress += isec->getSize();
+ }
+
+ // The area under the curve F where F(t) is the total number of page
+ // faults at step t.
+ unsigned area = 0;
+ for (auto &trace : reader->getTemporalProfTraces()) {
+ SmallSet<uint64_t, 0> touchedPages;
+ for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
+ auto traceId = trace.FunctionNameRefs[step];
+ auto [Filename, ParsedFuncName] =
+ getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
+ ParsedFuncName = BPSectionBase::getRootSymbol(ParsedFuncName);
+ auto it = symbolToPageNumbers.find(ParsedFuncName);
+ if (it != symbolToPageNumbers.end()) {
+ auto &[firstPage, lastPage] = it->getValue();
+ for (uint64_t i = firstPage; i <= lastPage; i++)
+ touchedPages.insert(i);
+ }
+ area += touchedPages.size();
+ }
+ }
+ dbgs() << "Total area under the page fault curve: " << (float)area
+ << "\n";
+ }
+ }
+
+ DenseMap<const BPSectionBase *, size_t> sectionPriorities;
+ for (const auto *isec : orderedSections)
+ sectionPriorities[isec] = --highestAvailablePriority;
+ return sectionPriorities;
+}
+
+} // namespace lld
diff --git a/lld/ELF/BPSectionOrderer.cpp b/lld/ELF/BPSectionOrderer.cpp
new file mode 100644
index 00000000000000..ac3024a69e681a
--- /dev/null
+++ b/lld/ELF/BPSectionOrderer.cpp
@@ -0,0 +1,50 @@
+//===- BPSectionOrderer.cpp--------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPSectionOrderer.h"
+#include "Config.h"
+#include "InputFiles.h"
+#include "InputSection.h"
+#include "lld/Common/CommonLinkerContext.h"
+#include "lld/Common/SectionOrderer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/TimeProfiler.h"
+
+using namespace llvm;
+using namespace lld::elf;
+
+llvm::DenseMap<const lld::elf::InputSectionBase *, int>
+lld::elf::runBalancedPartitioning(Ctx &ctx, llvm::StringRef profilePath,
+ bool forFunctionCompression,
+ bool forDataCompression,
+ bool compressionSortStartupFunctions,
+ bool verbose) {
+  size_t highestAvailablePriority = std::numeric_limits<int>::max();
+  // Own the wrappers for the duration of the call so every ELFSection is
+  // freed, even when the orderer filters a section out and it never
+  // appears in the result map (a bare new/delete scheme leaks those).
+  SmallVector<std::unique_ptr<ELFSection>> ownedSections;
+  SmallVector<lld::BPSectionBase *> sections;
+  for (auto *isec : ctx.inputSections) {
+    if (!isec || isec->content().empty())
+      continue;
+    ownedSections.push_back(std::make_unique<ELFSection>(isec));
+    sections.push_back(ownedSections.back().get());
+  }
+
+  auto reorderedSections =
+      lld::SectionOrderer::reorderSectionsByBalancedPartitioning(
+          highestAvailablePriority, profilePath, forFunctionCompression,
+          forDataCompression, compressionSortStartupFunctions, verbose,
+          sections);
+
+  DenseMap<const InputSectionBase *, int> result;
+  for (const auto &[sec, priority] : reorderedSections) {
+    // Every section handed to the orderer above is an ELFSection, so a
+    // checked cast suffices. (Avoid naming the variable "BPSectionBase",
+    // which shadows the type.)
+    const auto *elfSection = llvm::cast<ELFSection>(sec);
+    result[elfSection->getSection()] = static_cast<int>(priority);
+  }
+  return result;
+}
diff --git a/lld/ELF/BPSectionOrderer.h b/lld/ELF/BPSectionOrderer.h
new file mode 100644
index 00000000000000..c24f8d1277c108
--- /dev/null
+++ b/lld/ELF/BPSectionOrderer.h
@@ -0,0 +1,140 @@
+//===- BPSectionOrderer.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file uses Balanced Partitioning to order sections to improve startup
+/// time and compressed size.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_BPSECTION_ORDERER_H
+#define LLD_ELF_BPSECTION_ORDERER_H
+
+#include "InputFiles.h"
+#include "InputSection.h"
+#include "Relocations.h"
+#include "Symbols.h"
+#include "lld/Common/SectionOrderer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/xxhash.h"
+
+namespace lld::elf {
+
+class InputSection;
+
+class ELFSymbol : public BPSymbol {
+ const Symbol *sym;
+
+public:
+ explicit ELFSymbol(const Symbol *s) : sym(s) {}
+
+ llvm::StringRef getName() const override { return sym->getName(); }
+
+  BPSymbol *asDefinedSymbol() override {
+    // isa<> avoids the unused-variable warning the dyn_cast form produced.
+    if (llvm::isa<Defined>(sym))
+      return this;
+    return nullptr;
+  }
+
+ uint64_t getValue() const override {
+ if (auto *d = llvm::dyn_cast<Defined>(sym))
+ return d->value;
+ return 0;
+ }
+
+ uint64_t getSize() const override {
+ if (auto *d = llvm::dyn_cast<Defined>(sym))
+ return d->size;
+ return 0;
+ }
+
+ const Symbol *getSymbol() const { return sym; }
+};
+
+class ELFSection : public BPSectionBase {
+  const InputSectionBase *isec;
+  // Lazily-built owning cache of wrapped symbols, plus a parallel array of
+  // raw pointers so getSymbols() can return a stable, per-instance
+  // ArrayRef. (A function-local static vector would be shared by all
+  // instances, invalidated on every call, and not thread-safe.)
+  mutable std::vector<std::unique_ptr<ELFSymbol>> symbolCache;
+  mutable std::vector<BPSymbol *> symbolPtrs;
+
+public:
+  explicit ELFSection(const InputSectionBase *sec) : isec(sec) {}
+
+  const InputSectionBase *getSection() const { return isec; }
+
+  llvm::StringRef getName() const override { return isec->name; }
+
+  uint64_t getSize() const override { return isec->getSize(); }
+
+  bool isCodeSection() const override {
+    return isec->flags & llvm::ELF::SHF_EXECINSTR;
+  }
+
+  bool hasValidData() const override {
+    return isec && !isec->content().empty();
+  }
+
+  llvm::ArrayRef<uint8_t> getSectionData() const override {
+    return isec->content();
+  }
+
+  llvm::ArrayRef<BPSymbol *> getSymbols() const override {
+    if (symbolPtrs.empty()) {
+      // FIXME: this wraps every symbol of the containing file, not just
+      // the symbols defined in this section — confirm that is intended.
+      for (Symbol *sym : isec->file->getSymbols()) {
+        symbolCache.push_back(std::make_unique<ELFSymbol>(sym));
+        symbolPtrs.push_back(symbolCache.back().get());
+      }
+    }
+    return symbolPtrs;
+  }
+
+  void getSectionHash(llvm::SmallVectorImpl<uint64_t> &hashes,
+                      const llvm::DenseMap<const BPSectionBase *, uint64_t>
+                          &sectionToIdx) const override {
+    constexpr unsigned windowSize = 4;
+
+    // Calculate content hashes: a sliding window over the section data.
+    // take_front() clamps at the end of the buffer, unlike
+    // slice(i, windowSize), which asserts once i + windowSize overruns
+    // the section.
+    llvm::ArrayRef<uint8_t> data = isec->content();
+    for (size_t i = 0; i < data.size(); i++) {
+      auto window = data.drop_front(i).take_front(windowSize);
+      hashes.push_back(xxHash64(window));
+    }
+
+    // TODO: Calculate relocation hashes (sectionToIdx is unused until
+    // then). Relocations are complex in ELF, and the results without them
+    // are good enough, so we skip them for now.
+
+    llvm::sort(hashes);
+    hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
+  }
+
+  // FIXME: returning true unconditionally makes dyn_cast<ELFSection>
+  // succeed for any BPSectionBase; add a real kind discriminator to the
+  // base class if sections from other ports can ever reach these APIs.
+  static bool classof(const BPSectionBase *s) { return true; }
+};
+
+/// Run Balanced Partitioning to find the optimal function and data order to
+/// improve startup time and compressed size.
+///
+/// It is important that .subsections_via_symbols is used to ensure functions
+/// and data are in their own sections and thus can be reordered.
+llvm::DenseMap<const lld::elf::InputSectionBase *, int>
+runBalancedPartitioning(Ctx &ctx, llvm::StringRef profilePath,
+ bool forFunctionCompression, bool forDataCompression,
+ bool compressionSortStartupFunctions, bool verbose);
+} // namespace lld::elf
+
+#endif
diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
index 83d816ddb0601e..7bf70c3cbbadc5 100644
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -56,6 +56,7 @@ add_lld_library(lldELF
SymbolTable.cpp
Symbols.cpp
SyntheticSections.cpp
+ BPSectionOrderer.cpp
Target.cpp
Thunks.cpp
Writer.cpp
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index a2836733c2715e..10054f01a5fe5c 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -264,6 +264,11 @@ struct Config {
bool armBe8 = false;
BsymbolicKind bsymbolic = BsymbolicKind::None;
CGProfileSortKind callGraphProfileSort;
+ llvm::StringRef irpgoProfileSortProfilePath;
+ bool compressionSortStartupFunctions = false;
+ bool functionOrderForCompression = false;
+ bool dataOrderForCompression = false;
+ bool verboseBpSectionOrderer = false;
bool checkSections;
bool checkDynamicRelocs;
std::optional<llvm::DebugCompressionType> compressDebugSections;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 10c52d7206b805..e79713d3227907 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1255,6 +1255,27 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
ctx.arg.bsymbolic = BsymbolicKind::All;
}
ctx.arg.callGraphProfileSort = getCGProfileSortKind(ctx, args);
+ ctx.arg.irpgoProfileSortProfilePath =
+ args.getLastArgValue(OPT_irpgo_profile_sort);
+ ctx.arg.compressionSortStartupFunctions =
+ args.hasFlag(OPT_compression_sort_startup_functions,
+ OPT_no_compression_sort_startup_functions, false);
+ if (auto *arg = args.getLastArg(OPT_compression_sort)) {
+ StringRef compressionSortStr = arg->getValue();
+ if (compressionSortStr == "function") {
+ ctx.arg.functionOrderForCompression = true;
+ } else if (compressionSortStr == "data") {
+ ctx.arg.dataOrderForCompression = true;
+ } else if (compressionSortStr == "both") {
+ ctx.arg.functionOrderForCompression = true;
+ ctx.arg.dataOrderForCompression = true;
+ } else if (compressionSortStr != "none") {
+ ErrAlways(ctx) << arg->getSpelling() << ": invalid argument '"
+ << arg->getValue()
+ << "', only 'function' or 'data' or 'both' is supported";
+ }
+ }
+ ctx.arg.verboseBpSectionOrderer = args.hasArg(OPT_verbose_bp_section_orderer);
ctx.arg.checkSections =
args.hasFlag(OPT_check_sections, OPT_no_check_sections, true);
ctx.arg.chroot = args.getLastArgValue(OPT_chroot);
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index ebe77204264210..0c51bf4e555868 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -141,6 +141,20 @@ def call_graph_profile_sort: JJ<"call-graph-profile-sort=">,
def : FF<"no-call-graph-profile-sort">, Alias<call_graph_profile_sort>, AliasArgs<["none"]>,
Flags<[HelpHidden]>;
+defm irpgo_profile_sort: Eq<"irpgo-profile-sort",
+ "Read the IRPGO profile at <profile> to order sections to improve startup time">;
+
+defm compression_sort_startup_functions: BB<"compression-sort-startup-functions",
+ "Order startup functions to improve compressed size in addition to startup time",
+  "Do not order startup functions for compression">;
+
+def compression_sort: JJ<"compression-sort=">,
+ MetaVarName<"[none,function,data,both]">,
+ HelpText<"Order sections to improve compressed size">;
+
+def verbose_bp_section_orderer: FF<"verbose-bp-section-orderer">,
+ HelpText<"Print information on how many sections were ordered by balanced partitioning and a measure of the expected number of page faults">;
+
// --chroot doesn't have a help text because it is an internal option.
def chroot: Separate<["--"], "chroot">;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 67497bad7cb235..8befd8be15277b 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -9,6 +9,7 @@
#include "Writer.h"
#include "AArch64ErrataFix.h"
#include "ARMErrataFix.h"
+#include "BPSectionOrderer.h"
#include "CallGraphSort.h"
#include "Config.h"
#include "InputFiles.h"
@@ -1078,8 +1079,18 @@ static void maybeShuffle(Ctx &ctx,
// Builds section order for handling --symbol-ordering-file.
static DenseMap<const InputSectionBase *, int> buildSectionOrder(Ctx &ctx) {
DenseMap<const InputSectionBase *, int> sectionOrder;
+  if (!ctx.arg.irpgoProfileSortProfilePath.empty() ||
+      ctx.arg.functionOrderForCompression || ctx.arg.dataOrderForCompression ||
+      ctx.arg.compressionSortStartupFunctions) {
+ TimeTraceScope timeScope("Balanced Partitioning Section Orderer");
+    sectionOrder = runBalancedPartitioning(
+        ctx, ctx.arg.irpgoProfileSortProfilePath,
+        ctx.arg.functionOrderForCompression, ctx.arg.dataOrderForCompression,
+        ctx.arg.compressionSortStartupFunctions,
+        ctx.arg.verboseBpSectionOrderer);
+ }
// Use the rarely used option --call-graph-ordering-file to sort sections.
- if (!ctx.arg.callGraphProfile.empty())
+ else if (!ctx.arg.callGraphProfile.empty())
return computeCallGraphProfileOrder(ctx);
if (ctx.arg.symbolOrderingFile.empty())
diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp
index 5db2242a35ef28..1edf883209f719 100644
--- a/lld/MachO/BPSectionOrderer.cpp
+++ b/lld/MachO/BPSectionOrderer.cpp
@@ -11,425 +11,41 @@
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/TimeProfiler.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Support/xxhash.h"
-#define DEBUG_TYPE "bp-section-orderer"
using namespace llvm;
using namespace lld::macho;
-using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;
-
-/// Symbols can be appended with "(.__uniq.xxxx)?.llvm.yyyy" where "xxxx" and
-/// "yyyy" are numbers that could change between builds. We need to use the root
-/// symbol name before this suffix so these symbols can be matched with profiles
-/// which may have different suffixes.
-static StringRef getRootSymbol(StringRef Name) {
- auto [P0, S0] = Name.rsplit(".llvm.");
- auto [P1, S1] = P0.rsplit(".__uniq.");
- return P1;
-}
-
-static uint64_t getRelocHash(StringRef kind, uint64_t sectionIdx,
- uint64_t offset, uint64_t addend) {
- return xxHash64((kind + ": " + Twine::utohexstr(sectionIdx) + " + " +
- Twine::utohexstr(offset) + " + " + Twine::utohexstr(addend))
- .str());
-}
-
-static uint64_t
-getRelocHash(const Reloc &reloc,
- const DenseMap<const InputSection *, uint64_t> §ionToIdx) {
- auto *isec = reloc.getReferentInputSection();
- std::optional<uint64_t> sectionIdx;
- auto sectionIdxIt = sectionToIdx.find(isec);
- if (sectionIdxIt != sectionToIdx.end())
- sectionIdx = sectionIdxIt->getSecond();
- std::string kind;
- if (isec)
- kind = ("Section " + Twine(static_cast<uint8_t>(isec->kind()))).str();
- if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
- kind += (" Symbol " + Twine(static_cast<uint8_t>(sym->kind()))).str();
- if (auto *d = dyn_cast<Defined>(sym))
- return getRelocHash(kind, sectionIdx.value_or(0), d->value, reloc.addend);
- }
- return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
-}
-
-/// Given \p sectionIdxs, a list of section indexes, return a list of utility
-/// nodes for each section index. If \p duplicateSectionIdx is provided,
-/// populate it with nearly identical sections. Increment \p maxUN to be the
-/// largest utility node we have used so far.
-static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression(
- ArrayRef<const InputSection *> sections,
- const DenseMap<const InputSection *, uint64_t> §ionToIdx,
- ArrayRef<unsigned> sectionIdxs,
- DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
- BPFunctionNode::UtilityNodeT &maxUN) {
- TimeTraceScope timeScope("Build nodes for compression");
-
- SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
- sectionHashes.reserve(sectionIdxs.size());
- SmallVector<uint64_t> hashes;
- for (unsigned sectionIdx : sectionIdxs) {
- const auto *isec = sections[sectionIdx];
- constexpr unsigned windowSize = 4;
-
- for (size_t i = 0; i < isec->data.size(); i++) {
- auto window = isec->data.drop_front(i).take_front(windowSize);
- hashes.push_back(xxHash64(window));
- }
- for (const auto &r : isec->relocs) {
- if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
- continue;
- uint64_t relocHash = getRelocHash(r, sectionToIdx);
- uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
- for (uint32_t i = start; i < r.offset + r.length; i++) {
- auto window = isec->data.drop_front(i).take_front(windowSize);
- hashes.push_back(xxHash64(window) + relocHash);
- }
- }
-
- llvm::sort(hashes);
- hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
-
- sectionHashes.emplace_back(sectionIdx, hashes);
- hashes.clear();
- }
-
- DenseMap<uint64_t, unsigned> hashFrequency;
- for (auto &[sectionIdx, hashes] : sectionHashes)
- for (auto hash : hashes)
- ++hashFrequency[hash];
-
- if (duplicateSectionIdxs) {
- // Merge section that are nearly identical
- SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
- DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
- for (auto &[sectionIdx, hashes] : sectionHashes) {
- uint64_t wholeHash = 0;
- for (auto hash : hashes)
- if (hashFrequency[hash] > 5)
- wholeHash ^= hash;
- auto [it, wasInserted] =
- wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
- if (wasInserted) {
- newSectionHashes.emplace_back(sectionIdx, hashes);
- } else {
- (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
- }
- }
- sectionHashes = newSectionHashes;
-
- // Recompute hash frequencies
- hashFrequency.clear();
- for (auto &[sectionIdx, hashes] : sectionHashes)
- for (auto hash : hashes)
- ++hashFrequency[hash];
- }
-
- // Filter rare and common hashes and assign each a unique utility node that
- // doesn't conflict with the trace utility nodes
- DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
- for (auto &[hash, frequency] : hashFrequency) {
- if (frequency <= 1 || frequency * 2 > sectionHashes.size())
- continue;
- hashToUN[hash] = ++maxUN;
- }
-
- SmallVector<std::pair<unsigned, UtilityNodes>> sectionUns;
- for (auto &[sectionIdx, hashes] : sectionHashes) {
- UtilityNodes uns;
- for (auto &hash : hashes) {
- auto it = hashToUN.find(hash);
- if (it != hashToUN.end())
- uns.push_back(it->second);
- }
- sectionUns.emplace_back(sectionIdx, uns);
- }
- return sectionUns;
-}
-
DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
size_t &highestAvailablePriority, StringRef profilePath,
bool forFunctionCompression, bool forDataCompression,
bool compressionSortStartupFunctions, bool verbose) {
- SmallVector<const InputSection *> sections;
- DenseMap<const InputSection *, uint64_t> sectionToIdx;
- StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
+ SmallVector<BPSectionBase *> sections;
for (const auto *file : inputFiles) {
for (auto *sec : file->sections) {
for (auto &subsec : sec->subsections) {
auto *isec = subsec.isec;
if (!isec || isec->data.empty() || !isec->data.data())
continue;
- unsigned sectionIdx = sections.size();
- sectionToIdx.try_emplace(isec, sectionIdx);
- sections.push_back(isec);
- for (Symbol *sym : isec->symbols)
- if (auto *d = dyn_cast_or_null<Defined>(sym))
- symbolToSectionIdxs[d->getName()].insert(sectionIdx);
+ sections.push_back(new MachoSection(isec));
}
}
}
- StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
- for (auto &entry : symbolToSectionIdxs) {
- StringRef name = entry.getKey();
- auto &sectionIdxs = entry.getValue();
- name = getRootSymbol(name);
- rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
- sectionIdxs.end());
- // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
- // Mangler::getNameWithPrefix() for details.
- if (name.consume_front("_") || name.consume_front("l_"))
- rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
- sectionIdxs.end());
- }
-
- BPFunctionNode::UtilityNodeT maxUN = 0;
- DenseMap<unsigned, UtilityNodes> startupSectionIdxUNs;
- // Used to define the initial order for startup functions.
- DenseMap<unsigned, size_t> sectionIdxToTimestamp;
- std::unique_ptr<InstrProfReader> reader;
- if (!profilePath.empty()) {
- auto fs = vfs::getRealFileSystem();
- auto readerOrErr = InstrProfReader::create(profilePath, *fs);
- lld::checkError(readerOrErr.takeError());
+ auto reorderedSections =
+ lld::SectionOrderer::reorderSectionsByBalancedPartitioning(
+ highestAvailablePriority, profilePath, forFunctionCompression,
+ forDataCompression, compressionSortStartupFunctions, verbose,
+ sections);
- reader = std::move(readerOrErr.get());
- for (auto &entry : *reader) {
- // Read all entries
- (void)entry;
+ DenseMap<const InputSection *, size_t> result;
+ for (const auto &[BPSectionBase, priority] : reorderedSections) {
+ if (auto *machoSection = dyn_cast<MachoSection>(BPSectionBase)) {
+ result[machoSection->getSection()] = priority;
+ delete machoSection;
}
- auto &traces = reader->getTemporalProfTraces();
-
- DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
- for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
- uint64_t currentSize = 0, cutoffSize = 1;
- size_t cutoffTimestamp = 1;
- auto &trace = traces[traceIdx].FunctionNameRefs;
- for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
- auto [Filename, ParsedFuncName] = getParsedIRPGOName(
- reader->getSymtab().getFuncOrVarName(trace[timestamp]));
- ParsedFuncName = getRootSymbol(ParsedFuncName);
-
- auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
- if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
- continue;
- auto &sectionIdxs = sectionIdxsIt->getValue();
- // If the same symbol is found in multiple sections, they might be
- // identical, so we arbitrarily use the size from the first section.
- currentSize += sections[*sectionIdxs.begin()]->getSize();
-
- // Since BalancedPartitioning is sensitive to the initial order, we need
- // to explicitly define it to be ordered by earliest timestamp.
- for (unsigned sectionIdx : sectionIdxs) {
- auto [it, wasInserted] =
- sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
- if (!wasInserted)
- it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
- }
-
- if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
- ++maxUN;
- cutoffSize = 2 * currentSize;
- cutoffTimestamp = 2 * cutoffTimestamp;
- }
- for (unsigned sectionIdx : sectionIdxs)
- sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
- }
- for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
- for (auto un = firstUN; un <= maxUN; ++un)
- startupSectionIdxUNs[sectionIdx].push_back(un);
- ++maxUN;
- sectionIdxToFirstUN.clear();
- }
- }
-
- SmallVector<unsigned> sectionIdxsForFunctionCompression,
- sectionIdxsForDataCompression;
- for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
- if (startupSectionIdxUNs.count(sectionIdx))
- continue;
- const auto *isec = sections[sectionIdx];
- if (isCodeSection(isec)) {
- if (forFunctionCompression)
- sectionIdxsForFunctionCompression.push_back(sectionIdx);
- } else {
- if (forDataCompression)
- sectionIdxsForDataCompression.push_back(sectionIdx);
- }
- }
-
- if (compressionSortStartupFunctions) {
- SmallVector<unsigned> startupIdxs;
- for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
- startupIdxs.push_back(sectionIdx);
- auto unsForStartupFunctionCompression =
- getUnsForCompression(sections, sectionToIdx, startupIdxs,
- /*duplicateSectionIdxs=*/nullptr, maxUN);
- for (auto &[sectionIdx, compressionUns] :
- unsForStartupFunctionCompression) {
- auto &uns = startupSectionIdxUNs[sectionIdx];
- uns.append(compressionUns);
- llvm::sort(uns);
- uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
- }
- }
-
- // Map a section index (order directly) to a list of duplicate section indices
- // (not ordered directly).
- DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
- auto unsForFunctionCompression = getUnsForCompression(
- sections, sectionToIdx, sectionIdxsForFunctionCompression,
- &duplicateSectionIdxs, maxUN);
- auto unsForDataCompression = getUnsForCompression(
- sections, sectionToIdx, sectionIdxsForDataCompression,
- &duplicateSectionIdxs, maxUN);
-
- std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
- nodesForDataCompression;
- for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
- nodesForStartup.emplace_back(sectionIdx, uns);
- for (auto &[sectionIdx, uns] : unsForFunctionCompression)
- nodesForFunctionCompression.emplace_back(sectionIdx, uns);
- for (auto &[sectionIdx, uns] : unsForDataCompression)
- nodesForDataCompression.emplace_back(sectionIdx, uns);
-
- // Use the first timestamp to define the initial order for startup nodes.
- llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
- return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
- std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
- });
- // Sort compression nodes by their Id (which is the section index) because the
- // input linker order tends to be not bad.
- llvm::sort(nodesForFunctionCompression,
- [](auto &L, auto &R) { return L.Id < R.Id; });
- llvm::sort(nodesForDataCompression,
- [](auto &L, auto &R) { return L.Id < R.Id; });
-
- {
- TimeTraceScope timeScope("Balanced Partitioning");
- BalancedPartitioningConfig config;
- BalancedPartitioning bp(config);
- bp.run(nodesForStartup);
- bp.run(nodesForFunctionCompression);
- bp.run(nodesForDataCompression);
}
-
- unsigned numStartupSections = 0;
- unsigned numCodeCompressionSections = 0;
- unsigned numDuplicateCodeSections = 0;
- unsigned numDataCompressionSections = 0;
- unsigned numDuplicateDataSections = 0;
- SetVector<const InputSection *> orderedSections;
- // Order startup functions,
- for (auto &node : nodesForStartup) {
- const auto *isec = sections[node.Id];
- if (orderedSections.insert(isec))
- ++numStartupSections;
- }
- // then functions for compression,
- for (auto &node : nodesForFunctionCompression) {
- const auto *isec = sections[node.Id];
- if (orderedSections.insert(isec))
- ++numCodeCompressionSections;
-
- auto It = duplicateSectionIdxs.find(node.Id);
- if (It == duplicateSectionIdxs.end())
- continue;
- for (auto dupSecIdx : It->getSecond()) {
- const auto *dupIsec = sections[dupSecIdx];
- if (orderedSections.insert(dupIsec))
- ++numDuplicateCodeSections;
- }
- }
- // then data for compression.
- for (auto &node : nodesForDataCompression) {
- const auto *isec = sections[node.Id];
- if (orderedSections.insert(isec))
- ++numDataCompressionSections;
- auto It = duplicateSectionIdxs.find(node.Id);
- if (It == duplicateSectionIdxs.end())
- continue;
- for (auto dupSecIdx : It->getSecond()) {
- const auto *dupIsec = sections[dupSecIdx];
- if (orderedSections.insert(dupIsec))
- ++numDuplicateDataSections;
- }
- }
-
- if (verbose) {
- unsigned numTotalOrderedSections =
- numStartupSections + numCodeCompressionSections +
- numDuplicateCodeSections + numDataCompressionSections +
- numDuplicateDataSections;
- dbgs()
- << "Ordered " << numTotalOrderedSections
- << " sections using balanced partitioning:\n Functions for startup: "
- << numStartupSections
- << "\n Functions for compression: " << numCodeCompressionSections
- << "\n Duplicate functions: " << numDuplicateCodeSections
- << "\n Data for compression: " << numDataCompressionSections
- << "\n Duplicate data: " << numDuplicateDataSections << "\n";
-
- if (!profilePath.empty()) {
- // Evaluate this function order for startup
- StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
- const uint64_t pageSize = (1 << 14);
- uint64_t currentAddress = 0;
- for (const auto *isec : orderedSections) {
- for (Symbol *sym : isec->symbols) {
- if (auto *d = dyn_cast_or_null<Defined>(sym)) {
- uint64_t startAddress = currentAddress + d->value;
- uint64_t endAddress = startAddress + d->size;
- uint64_t firstPage = startAddress / pageSize;
- // I think the kernel might pull in a few pages when one it touched,
- // so it might be more accurate to force lastPage to be aligned by
- // 4?
- uint64_t lastPage = endAddress / pageSize;
- StringRef rootSymbol = d->getName();
- rootSymbol = getRootSymbol(rootSymbol);
- symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
- if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
- symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
- }
- }
-
- currentAddress += isec->getSize();
- }
-
- // The area under the curve F where F(t) is the total number of page
- // faults at step t.
- unsigned area = 0;
- for (auto &trace : reader->getTemporalProfTraces()) {
- SmallSet<uint64_t, 0> touchedPages;
- for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
- auto traceId = trace.FunctionNameRefs[step];
- auto [Filename, ParsedFuncName] =
- getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
- ParsedFuncName = getRootSymbol(ParsedFuncName);
- auto it = symbolToPageNumbers.find(ParsedFuncName);
- if (it != symbolToPageNumbers.end()) {
- auto &[firstPage, lastPage] = it->getValue();
- for (uint64_t i = firstPage; i <= lastPage; i++)
- touchedPages.insert(i);
- }
- area += touchedPages.size();
- }
- }
- dbgs() << "Total area under the page fault curve: " << (float)area
- << "\n";
- }
- }
-
- DenseMap<const InputSection *, size_t> sectionPriorities;
- for (const auto *isec : orderedSections)
- sectionPriorities[isec] = --highestAvailablePriority;
- return sectionPriorities;
+ return result;
}
diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h
index cefd0ceb10e561..3f006ed2fb3618 100644
--- a/lld/MachO/BPSectionOrderer.h
+++ b/lld/MachO/BPSectionOrderer.h
@@ -14,13 +14,149 @@
#ifndef LLD_MACHO_BPSECTION_ORDERER_H
#define LLD_MACHO_BPSECTION_ORDERER_H
+#include "InputSection.h"
+#include "Relocations.h"
+#include "Symbols.h"
+#include "lld/Common/SectionOrderer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/TinyPtrVector.h"
namespace lld::macho {
class InputSection;
+class MachoSymbol : public BPSymbol {
+ const Symbol *sym;
+
+public:
+ explicit MachoSymbol(const Symbol *s) : sym(s) {}
+
+ llvm::StringRef getName() const override { return sym->getName(); }
+
+ BPSymbol *asDefinedSymbol() override {
+ if (auto *d = llvm::dyn_cast<Defined>(sym))
+ return this;
+ return nullptr;
+ }
+
+ uint64_t getValue() const override {
+ if (auto *d = llvm::dyn_cast<Defined>(sym))
+ return d->value;
+ return 0;
+ }
+
+ uint64_t getSize() const override {
+ if (auto *d = llvm::dyn_cast<Defined>(sym))
+ return d->size;
+ return 0;
+ }
+
+ const Symbol *getSymbol() const { return sym; }
+};
+
+class MachoSection : public BPSectionBase {
+ const InputSection *isec;
+ mutable std::vector<std::unique_ptr<MachoSymbol>> symbolCache;
+
+public:
+ explicit MachoSection(const InputSection *sec) : isec(sec) {}
+
+ const InputSection *getSection() const { return isec; }
+
+ llvm::StringRef getName() const override { return isec->getName(); }
+
+ uint64_t getSize() const override { return isec->getSize(); }
+
+ bool isCodeSection() const override { return macho::isCodeSection(isec); }
+
+ bool hasValidData() const override {
+ return isec && !isec->data.empty() && isec->data.data();
+ }
+
+ llvm::ArrayRef<uint8_t> getSectionData() const override { return isec->data; }
+
+ llvm::ArrayRef<BPSymbol *> getSymbols() const override {
+ // Lazy initialization of symbol cache
+ if (symbolCache.empty()) {
+ for (const auto *sym : isec->symbols)
+ symbolCache.push_back(std::make_unique<MachoSymbol>(sym));
+ }
+ static std::vector<BPSymbol *> result;
+ result.clear();
+ for (const auto &sym : symbolCache)
+ result.push_back(sym.get());
+ return result;
+ }
+
+ void getSectionHash(llvm::SmallVectorImpl<uint64_t> &hashes,
+ const llvm::DenseMap<const BPSectionBase *, uint64_t>
+ &sectionToIdx) const override {
+ constexpr unsigned windowSize = 4;
+
+ // Convert BPSectionBase map to InputSection map
+ llvm::DenseMap<const InputSection *, uint64_t> machoSectionToIdx;
+ for (const auto &[sec, idx] : sectionToIdx) {
+ if (auto *machoSec = llvm::dyn_cast<MachoSection>(sec))
+ machoSectionToIdx[machoSec->getInputSection()] = idx;
+ }
+
+ // Calculate content hashes
+ for (size_t i = 0; i < isec->data.size(); i++) {
+ auto window = isec->data.drop_front(i).take_front(windowSize);
+ hashes.push_back(xxHash64(window));
+ }
+
+ // Calculate relocation hashes
+ for (const auto &r : isec->relocs) {
+ if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
+ continue;
+
+ uint64_t relocHash = getRelocHash(r, machoSectionToIdx);
+ uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
+ for (uint32_t i = start; i < r.offset + r.length; i++) {
+ auto window = isec->data.drop_front(i).take_front(windowSize);
+ hashes.push_back(xxHash64(window) + relocHash);
+ }
+ }
+
+ llvm::sort(hashes);
+ hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
+ }
+
+ const InputSection *getInputSection() const { return isec; }
+
+ static bool classof(const BPSectionBase *s) { return true; }
+
+private:
+ static uint64_t getRelocHash(
+ const Reloc &reloc,
+ const llvm::DenseMap<const InputSection *, uint64_t> &sectionToIdx) {
+ auto *isec = reloc.getReferentInputSection();
+ std::optional<uint64_t> sectionIdx;
+ auto sectionIdxIt = sectionToIdx.find(isec);
+ if (sectionIdxIt != sectionToIdx.end())
+ sectionIdx = sectionIdxIt->getSecond();
+
+ std::string kind;
+ if (isec)
+ kind = ("Section " + Twine(isec->kind())).str();
+
+ if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
+ kind += (" Symbol " + Twine(sym->kind())).str();
+ if (auto *d = llvm::dyn_cast<Defined>(sym)) {
+ if (llvm::isa_and_nonnull<CStringInputSection>(isec))
+ return BPSectionBase::getRelocHash(kind, 0, isec->getOffset(d->value),
+ reloc.addend);
+ return BPSectionBase::getRelocHash(kind, sectionIdx.value_or(0),
+ d->value, reloc.addend);
+ }
+ }
+ return BPSectionBase::getRelocHash(kind, sectionIdx.value_or(0), 0,
+ reloc.addend);
+ }
+};
+
/// Run Balanced Partitioning to find the optimal function and data order to
/// improve startup time and compressed size.
///
diff --git a/lld/include/lld/Common/SectionOrderer.h b/lld/include/lld/Common/SectionOrderer.h
new file mode 100644
index 00000000000000..7edb79c57338d1
--- /dev/null
+++ b/lld/include/lld/Common/SectionOrderer.h
@@ -0,0 +1,75 @@
+//===- SectionOrderer.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common interfaces which may be used by
+// BPSectionOrderer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_COMMON_SECTION_ORDERER_H
+#define LLD_COMMON_SECTION_ORDERER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/xxhash.h"
+
+namespace lld {
+
+class BPSymbol {
+
+public:
+ virtual ~BPSymbol() = default;
+ virtual llvm::StringRef getName() const = 0;
+ virtual BPSymbol *asDefinedSymbol() = 0;
+ virtual uint64_t getValue() const = 0;
+ virtual uint64_t getSize() const = 0;
+};
+
+class BPSectionBase {
+public:
+ virtual ~BPSectionBase() = default;
+ virtual llvm::StringRef getName() const = 0;
+ virtual uint64_t getSize() const = 0;
+ virtual bool hasValidData() const = 0;
+ virtual bool isCodeSection() const = 0;
+ virtual llvm::ArrayRef<uint8_t> getSectionData() const = 0;
+ virtual llvm::ArrayRef<BPSymbol *> getSymbols() const = 0;
+ virtual void
+ getSectionHash(llvm::SmallVectorImpl<uint64_t> &hashes,
+ const llvm::DenseMap<const BPSectionBase *, uint64_t>
+ &sectionToIdx) const = 0;
+ static llvm::StringRef getRootSymbol(llvm::StringRef Name) {
+ auto [P0, S0] = Name.rsplit(".llvm.");
+ auto [P1, S1] = P0.rsplit(".__uniq.");
+ return P1;
+ }
+
+ static uint64_t getRelocHash(llvm::StringRef kind, uint64_t sectionIdx,
+ uint64_t offset, uint64_t addend) {
+ return llvm::xxHash64((kind + ": " + llvm::Twine::utohexstr(sectionIdx) +
+ " + " + llvm::Twine::utohexstr(offset) + " + " +
+ llvm::Twine::utohexstr(addend))
+ .str());
+ }
+};
+
+class SectionOrderer {
+public:
+ static llvm::DenseMap<const BPSectionBase *, size_t>
+ reorderSectionsByBalancedPartitioning(
+ size_t &highestAvailablePriority, llvm::StringRef profilePath,
+ bool forFunctionCompression, bool forDataCompression,
+ bool compressionSortStartupFunctions, bool verbose,
+ llvm::SmallVector<BPSectionBase *> inputSections);
+};
+
+} // namespace lld
+
+#endif
diff --git a/lld/test/ELF/bp-section-orderer-errs.s b/lld/test/ELF/bp-section-orderer-errs.s
new file mode 100644
index 00000000000000..063b53f92cc74f
--- /dev/null
+++ b/lld/test/ELF/bp-section-orderer-errs.s
@@ -0,0 +1,19 @@
+# REQUIRES: aarch64
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t
+# RUN: echo "A B 5" > %t.call_graph
+# RUN: echo "B C 50" >> %t.call_graph
+# RUN: echo "C D 40" >> %t.call_graph
+# RUN: echo "D B 10" >> %t.call_graph
+# RUN: not ld.lld -o /dev/null --irpgo-profile-sort %s --call-graph-ordering-file=%t.call_graph 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
+# RUN: not ld.lld -o /dev/null --irpgo-profile-sort=%s --call-graph-ordering-file=%t.call_graph 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
+# IRPGO-ERR: --irpgo-profile-sort is incompatible with --call-graph-ordering-file
+
+# RUN: not ld.lld -o /dev/null --compression-sort=function --call-graph-ordering-file %t.call_graph 2>&1 | FileCheck %s --check-prefix=COMPRESSION-ERR
+# COMPRESSION-ERR: --compression-sort= is incompatible with --call-graph-ordering-file
+
+# RUN: not ld.lld -o /dev/null --compression-sort=malformed 2>&1 | FileCheck %s --check-prefix=COMPRESSION-MALFORM
+# COMPRESSION-MALFORM: unknown value `malformed` for --compression-sort=
+
+# RUN: not ld.lld -o /dev/null --compression-sort-startup-functions 2>&1 | FileCheck %s --check-prefix=STARTUP
+# STARTUP: --compression-sort-startup-functions must be used with --irpgo-profile-sort
diff --git a/lld/test/ELF/bp-section-orderer-stress.s b/lld/test/ELF/bp-section-orderer-stress.s
new file mode 100644
index 00000000000000..3e42d5e96969d9
--- /dev/null
+++ b/lld/test/ELF/bp-section-orderer-stress.s
@@ -0,0 +1,105 @@
+# REQUIRES: aarch64
+
+# Generate a large test case and check that the output is deterministic.
+
+# RUN: %python %s %t.s %t.proftext
+
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %t.s -o %t.o
+# RUN: llvm-profdata merge %t.proftext -o %t.profdata
+
+# RUN: ld.lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort-startup-functions --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
+# RUN: ld.lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort-startup-functions --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
+# RUN: diff %t.order1.txt %t.order2.txt
+
+import random
+import sys
+
+assembly_filepath = sys.argv[1]
+proftext_filepath = sys.argv[2]
+
+random.seed(1234)
+num_functions = 1000
+num_data = 100
+num_traces = 10
+
+function_names = [f"f{n}" for n in range(num_functions)]
+data_names = [f"d{n}" for n in range(num_data)]
+profiled_functions = function_names[: int(num_functions / 2)]
+
+function_contents = [
+ f"""
+{name}:
+ add w0, w0, #{i % 4096}
+ add w1, w1, #{i % 10}
+ add w2, w0, #{i % 20}
+ adrp x3, {name}@PAGE
+ ret
+"""
+ for i, name in enumerate(function_names)
+]
+
+data_contents = [
+ f"""
+{name}:
+ .ascii "s{i % 2}-{i % 3}-{i % 5}"
+ .xword {name}
+"""
+ for i, name in enumerate(data_names)
+]
+
+trace_contents = [
+ f"""
+# Weight
+1
+{", ".join(random.sample(profiled_functions, len(profiled_functions)))}
+"""
+ for i in range(num_traces)
+]
+
+profile_contents = [
+ f"""
+{name}
+# Func Hash:
+{i}
+# Num Counters:
+1
+# Counter Values:
+1
+"""
+ for i, name in enumerate(profiled_functions)
+]
+
+with open(assembly_filepath, "w") as f:
+ f.write(
+ f"""
+.text
+.globl _main
+
+_main:
+ ret
+
+{"".join(function_contents)}
+
+.data
+{"".join(data_contents)}
+
+.subsections_via_symbols
+"""
+ )
+
+with open(proftext_filepath, "w") as f:
+ f.write(
+ f"""
+:ir
+:temporal_prof_traces
+
+# Num Traces
+{num_traces}
+# Trace Stream Size:
+{num_traces}
+
+{"".join(trace_contents)}
+
+{"".join(profile_contents)}
+"""
+ )
diff --git a/lld/test/ELF/bp-section-orderer.s b/lld/test/ELF/bp-section-orderer.s
new file mode 100644
index 00000000000000..100cfad9d67297
--- /dev/null
+++ b/lld/test/ELF/bp-section-orderer.s
@@ -0,0 +1,123 @@
+# REQUIRES: aarch64
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=aarch64 %t/a.s -o %t/a.o
+# RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata
+
+# RUN: ld.lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
+# RUN: ld.lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
+
+# STARTUP: Ordered 3 sections using balanced partitioning
+
+# RUN: ld.lld -arch arm64 -lSystem -e _main -o - %t/a.o --irpgo-profile-sort=%t/a.profdata -order_file %t/a.orderfile | llvm-nm --numeric-sort --format=just-symbols - | FileCheck %s --check-prefix=ORDERFILE
+
+# ORDERFILE: A
+# ORDERFILE: F
+# ORDERFILE: E
+# ORDERFILE: D
+# ORDERFILE-DAG: _main
+# ORDERFILE-DAG: _B
+# ORDERFILE-DAG: l_C
+# ORDERFILE-DAG: s1
+# ORDERFILE-DAG: s2
+# ORDERFILE-DAG: r1
+# ORDERFILE-DAG: r2
+
+# RUN: ld.lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=function 2>&1 | FileCheck %s --check-prefix=COMPRESSION-FUNC
+# RUN: ld.lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=data 2>&1 | FileCheck %s --check-prefix=COMPRESSION-DATA
+# RUN: ld.lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
+# RUN: ld.lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both --irpgo-profile-sort=%t/a.profdata 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
+
+# COMPRESSION-FUNC: Ordered 7 sections using balanced partitioning
+# COMPRESSION-DATA: Ordered 4 sections using balanced partitioning
+# COMPRESSION-BOTH: Ordered 11 sections using balanced partitioning
+
+#--- a.s
+.text
+.globl _main, A, _B, l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
+
+_main:
+ ret
+A:
+ ret
+_B:
+ add w0, w0, #1
+ bl A
+ ret
+l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222:
+ add w0, w0, #2
+ bl A
+ ret
+D:
+ add w0, w0, #2
+ bl _B
+ ret
+E:
+ add w0, w0, #2
+ bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
+ ret
+F:
+ add w0, w0, #3
+ bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
+ ret
+
+.data
+s1:
+ .ascii "hello world"
+s2:
+ .ascii "i am a string"
+r1:
+ .quad s1
+r2:
+ .quad r1
+
+.subsections_via_symbols
+
+#--- a.proftext
+:ir
+:temporal_prof_traces
+# Num Traces
+1
+# Trace Stream Size:
+1
+# Weight
+1
+A, B, C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
+
+A
+# Func Hash:
+1111
+# Num Counters:
+1
+# Counter Values:
+1
+
+B
+# Func Hash:
+2222
+# Num Counters:
+1
+# Counter Values:
+1
+
+C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
+# Func Hash:
+3333
+# Num Counters:
+1
+# Counter Values:
+1
+
+D
+# Func Hash:
+4444
+# Num Counters:
+1
+# Counter Values:
+1
+
+#--- a.orderfile
+A
+F
+E
+D
More information about the llvm-commits
mailing list