[lld] [lld][InstrProf] Profile guided function order (PR #96268)

Vincent Lee via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 24 19:06:34 PDT 2024


================
@@ -0,0 +1,422 @@
+//===- BPSectionOrderer.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPSectionOrderer.h"
+#include "InputSection.h"
+#include "lld/Common/ErrorHandler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/TimeProfiler.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/xxhash.h"
+
+#define DEBUG_TYPE "bp-section-orderer"
+using namespace llvm;
+using namespace lld::macho;
+
+// TODO: Move to StringRef.h
+static bool isNumber(StringRef S) {
+  return !S.empty() && S.find_first_not_of("0123456789") == StringRef::npos;
+}
+
+/// Symbol names can carry a "(.__uniq.xxxx)?.llvm.yyyy" suffix, where "xxxx"
+/// and "yyyy" are numbers that can change between builds. We need the root
+/// symbol name, with this suffix stripped, so these symbols can be matched
+/// against profiles that may carry different suffixes.
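+/// For example, "_foo.__uniq.123.llvm.456" has the root symbol name "_foo".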
+static StringRef getRootSymbol(StringRef Name) {
+  auto [P0, S0] = Name.rsplit(".llvm.");
+  if (isNumber(S0))
+    Name = P0;
+  auto [P1, S1] = Name.rsplit(".__uniq.");
+  if (isNumber(S1))
+    return P1;
+  return Name;
+}
+
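+// Combine a relocation's kind string, section index, offset, and addend into
+// a single hash, so that structurally identical relocations hash identically.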
+static uint64_t getRelocHash(StringRef kind, uint64_t sectionIdx,
+                             uint64_t offset, uint64_t addend) {
+  return xxHash64((kind + ": " + Twine::utohexstr(sectionIdx) + " + " +
+                   Twine::utohexstr(offset) + " + " + Twine::utohexstr(addend))
+                      .str());
+}
+
+static uint64_t
+getRelocHash(const Reloc &reloc,
+             const DenseMap<const InputSection *, uint64_t> &sectionToIdx) {
+  auto *isec = reloc.getReferentInputSection();
+  std::optional<uint64_t> sectionIdx;
+  auto sectionIdxIt = sectionToIdx.find(isec);
+  if (sectionIdxIt != sectionToIdx.end())
+    sectionIdx = sectionIdxIt->getSecond();
+  std::string kind;
+  if (isec)
+    kind = ("Section " + Twine(isec->kind())).str();
+  if (auto *sym = reloc.referent.dyn_cast<Symbol *>()) {
+    kind += (" Symbol " + Twine(sym->kind())).str();
+    if (auto *d = dyn_cast<Defined>(sym)) {
+      if (isa_and_nonnull<CStringInputSection>(isec))
+        return getRelocHash(kind, 0, isec->getOffset(d->value), reloc.addend);
+      return getRelocHash(kind, sectionIdx.value_or(0), d->value, reloc.addend);
+    }
+  }
+  return getRelocHash(kind, sectionIdx.value_or(0), 0, reloc.addend);
+}
+
+static void constructNodesForCompression(
+    const SmallVector<const InputSection *> &sections,
+    const DenseMap<const InputSection *, uint64_t> &sectionToIdx,
+    const SmallVector<unsigned> &sectionIdxs,
+    std::vector<BPFunctionNode> &nodes,
+    DenseMap<unsigned, SmallVector<unsigned>> &duplicateSectionIdxs,
+    BPFunctionNode::UtilityNodeT &maxUN) {
+
+  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
+  sectionHashes.reserve(sectionIdxs.size());
+  SmallVector<uint64_t> hashes;
+  for (unsigned sectionIdx : sectionIdxs) {
+    const auto *isec = sections[sectionIdx];
+    constexpr unsigned windowSize = 4;
+
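+    // Hash every windowSize-byte sliding window of the section's contents
+    // (windows at the tail are shorter).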
+    for (size_t i = 0; i < isec->data.size(); i++) {
+      auto window = isec->data.drop_front(i).take_front(windowSize);
+      hashes.push_back(xxHash64(window));
+    }
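+    // Mix each relocation's hash into the content windows it overlaps.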
+    for (const auto &r : isec->relocs) {
+      if (r.length == 0 || r.referent.isNull() || r.offset >= isec->data.size())
+        continue;
+      uint64_t relocHash = getRelocHash(r, sectionToIdx);
+      uint32_t start = (r.offset < windowSize) ? 0 : r.offset - windowSize + 1;
+      for (uint32_t i = start; i < r.offset + r.length; i++) {
+        auto window = isec->data.drop_front(i).take_front(windowSize);
+        hashes.push_back(xxHash64(window) + relocHash);
+      }
+    }
+
+    llvm::sort(hashes);
+    hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end());
+
+    sectionHashes.emplace_back(sectionIdx, hashes);
+    hashes.clear();
+  }
+
+  DenseMap<uint64_t, unsigned> hashFrequency;
+  for (auto &[sectionIdx, hashes] : sectionHashes)
+    for (auto hash : hashes)
+      ++hashFrequency[hash];
+
+  // Merge sections that are nearly identical: treat two sections as
+  // duplicates when the XOR of their frequent hashes (those appearing in
+  // more than 5 sections) is identical.
+  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
+  DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
+  for (auto &[sectionIdx, hashes] : sectionHashes) {
+    uint64_t wholeHash = 0;
+    for (auto hash : hashes)
+      if (hashFrequency[hash] > 5)
+        wholeHash ^= hash;
+    auto [it, wasInserted] =
+        wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
+    if (wasInserted) {
+      newSectionHashes.emplace_back(sectionIdx, hashes);
+    } else {
+      duplicateSectionIdxs[it->getSecond()].push_back(sectionIdx);
+    }
+  }
+  sectionHashes = newSectionHashes;
+
+  // Recompute hash frequencies
+  hashFrequency.clear();
+  for (auto &[sectionIdx, hashes] : sectionHashes)
+    for (auto hash : hashes)
+      ++hashFrequency[hash];
+
+  // Filter rare and common hashes and assign each a unique utility node that
+  // doesn't conflict with the trace utility nodes
+  DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
+  for (auto &[hash, frequency] : hashFrequency) {
+    if (frequency <= 1 || frequency * 2 > wholeHashToSectionIdx.size())
+      continue;
+    hashToUN[hash] = ++maxUN;
+  }
+
+  std::vector<BPFunctionNode::UtilityNodeT> uns;
+  for (auto &[sectionIdx, hashes] : sectionHashes) {
+    for (auto &hash : hashes) {
+      auto it = hashToUN.find(hash);
+      if (it != hashToUN.end())
+        uns.push_back(it->second);
+    }
+    nodes.emplace_back(sectionIdx, uns);
+    uns.clear();
+  }
+}
+
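+/// Reorder sections using the BalancedPartitioning algorithm. Traces from the
+/// profile at \p profilePath (if provided) drive the startup ordering, while
+/// content and relocation hashes drive the ordering for compression when
+/// \p forFunctionCompression or \p forDataCompression is set. Returns the new
+/// priority for each reordered section.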
+DenseMap<const InputSection *, size_t> lld::macho::runBalancedPartitioning(
+    size_t &highestAvailablePriority, StringRef profilePath,
+    bool forFunctionCompression, bool forDataCompression) {
+
+  SmallVector<const InputSection *> sections;
+  DenseMap<const InputSection *, uint64_t> sectionToIdx;
+  StringMap<DenseSet<unsigned>> symbolToSectionIdxs;
+  for (const auto *file : inputFiles) {
+    for (auto *sec : file->sections) {
+      for (auto &subsec : sec->subsections) {
+        auto *isec = subsec.isec;
+        if (!isec || isec->data.empty() || !isec->data.data())
+          continue;
+        unsigned sectionIdx = sections.size();
+        sectionToIdx.try_emplace(isec, sectionIdx);
+        sections.push_back(isec);
+        for (Symbol *sym : isec->symbols)
+          if (auto *d = dyn_cast_or_null<Defined>(sym))
+            symbolToSectionIdxs[d->getName()].insert(sectionIdx);
+      }
+    }
+  }
+
+  StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
+  for (auto &entry : symbolToSectionIdxs) {
+    StringRef name = entry.getKey();
+    auto &sectionIdxs = entry.getValue();
+    name = getRootSymbol(name);
+    rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+                                         sectionIdxs.end());
+    // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
+    // Mangler::getNameWithPrefix() for details.
+    if (name.consume_front("_") || name.consume_front("l_"))
+      rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
+                                           sectionIdxs.end());
+  }
+
+  std::vector<BPFunctionNode> nodesForStartup;
+  BPFunctionNode::UtilityNodeT maxUN = 0;
+  DenseMap<unsigned, SmallVector<BPFunctionNode::UtilityNodeT>>
+      startupSectionIdxUNs;
+  std::unique_ptr<InstrProfReader> reader;
+  if (!profilePath.empty()) {
+    auto fs = vfs::getRealFileSystem();
+    auto readerOrErr = InstrProfReader::create(profilePath, *fs);
+    lld::checkError(readerOrErr.takeError());
+
+    reader = std::move(readerOrErr.get());
+    for (auto &entry : *reader) {
+      // Iterate over all entries to fully populate the reader's symbol table
+      // and temporal profile traces before querying them below.
+      (void)entry;
+    }
+    auto &traces = reader->getTemporalProfTraces();
+
+    // Used to define the initial order for startup functions.
+    DenseMap<unsigned, size_t> sectionIdxToTimestamp;
+    DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
+    for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
+      uint64_t currentSize = 0, cutoffSize = 1;
+      size_t cutoffTimestamp = 1;
+      auto &trace = traces[traceIdx].FunctionNameRefs;
+      for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
+        auto [Filename, ParsedFuncName] = getParsedIRPGOName(
+            reader->getSymtab().getFuncOrVarName(trace[timestamp]));
+        ParsedFuncName = getRootSymbol(ParsedFuncName);
+
+        auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
+        if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
+          continue;
+        auto &sectionIdxs = sectionIdxsIt->getValue();
+        // If the same symbol is found in multiple sections, they might be
+        // identical, so we arbitrarily use the size from the first section.
+        currentSize += sections[*sectionIdxs.begin()]->getSize();
+
+        // Since BalancedPartitioning is sensitive to the initial order, we need
+        // to explicitly define it to be ordered by earliest timestamp.
+        for (unsigned sectionIdx : sectionIdxs) {
+          auto [it, wasInserted] =
+              sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
+          if (!wasInserted)
+            it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
+        }
+
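+        // Open a new utility node whenever the timestamp or the accumulated
+        // section size crosses its cutoff, doubling both cutoffs each time so
+        // that utility nodes cover exponentially growing spans of the trace.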
+        if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
+          ++maxUN;
+          cutoffSize = 2 * currentSize;
+          cutoffTimestamp = 2 * cutoffTimestamp;
+        }
+        for (unsigned sectionIdx : sectionIdxs)
+          sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
+      }
+      for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
+        for (auto un = firstUN; un <= maxUN; ++un)
+          startupSectionIdxUNs[sectionIdx].push_back(un);
+      ++maxUN;
+      sectionIdxToFirstUN.clear();
+    }
+
+    // These uns should already be sorted without duplicates.
+    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
+      nodesForStartup.emplace_back(sectionIdx, uns);
+
+    llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
+      return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
+             std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
+    });
+  }
+
+  SmallVector<unsigned> sectionIdxsForFunctionCompression,
+      sectionIdxsForDataCompression;
+  for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
----------------
thevinster wrote:

Looks like we pay the cost of this loop even when neither function nor data compression is enabled. Should it be guarded so it only runs when at least one of those flags is set?
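Something along these lines, perhaps (untested sketch, reusing the existing forFunctionCompression/forDataCompression parameters):

    if (forFunctionCompression || forDataCompression) {
      for (unsigned sectionIdx = 0; sectionIdx < sections.size();
           sectionIdx++) {
        // ... existing body that populates sectionIdxsForFunctionCompression
        // and sectionIdxsForDataCompression ...
      }
    }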

https://github.com/llvm/llvm-project/pull/96268

