[PATCH] D36351: [lld][ELF] Add profile guided section layout

Tue Mar 6 15:03:42 PST 2018

Ah, and please git-clang-format :-)

Cheers,
Rafael

Michael Spencer via Phabricator via llvm-commits
<llvm-commits at lists.llvm.org> writes:

> Bigcheese updated this revision to Diff 137040.
> Bigcheese added a comment.
>
> Rewrote the algorithm to match hfsort.  Now gets the same performance as hfsort in Rafael's testcase.
>
>
> https://reviews.llvm.org/D36351
>
> Files:
>   ELF/CMakeLists.txt
>   ELF/CallGraphSort.cpp
>   ELF/CallGraphSort.h
>   ELF/Config.h
>   ELF/Driver.cpp
>   ELF/Options.td
>   ELF/Writer.cpp
>   test/ELF/cgprofile-txt.s
>
> Index: test/ELF/cgprofile-txt.s
> ===================================================================
> --- /dev/null
> +++ test/ELF/cgprofile-txt.s
> @@ -0,0 +1,106 @@
> +# REQUIRES: x86
> +
> +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
> +# RUN: ld.lld -e A %t -o %t2
> +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT
> +
> +# RUN: echo "A B 100" > %t.call_graph
> +# RUN: echo "A C 40" >> %t.call_graph
> +# RUN: echo "B C 30" >> %t.call_graph
> +# RUN: echo "C D 90" >> %t.call_graph
> +# RUN: echo "PP TS 100" >> %t.call_graph
> +# RUN: echo "_init2 _init 24567837" >> %t.call_graph
> +# RUN: echo "TS QC 9001" >> %t.call_graph
> +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
> +# RUN: llvm-readobj -symbols %t2 | FileCheck %s
> +
> +    .section    .text.D,"ax", at progbits
> +D:
> +    retq
> +
> +    .section    .text.C,"ax", at progbits
> +    .globl  C
> +C:
> +    retq
> +
> +    .section    .text.B,"ax", at progbits
> +    .globl  B
> +B:
> +    retq
> +
> +    .section    .text.A,"ax", at progbits
> +    .globl  A
> +A:
> +    retq
> +
> +    .section    .ponies,"ax", at progbits,unique,1
> +    .globl TS
> +TS:
> +    retq
> +
> +    .section    .ponies,"ax", at progbits,unique,2
> +    .globl PP
> +PP:
> +    retq
> +
> +    .section    .other,"ax", at progbits,unique,1
> +    .globl QC
> +QC:
> +    retq
> +
> +    .section    .other,"ax", at progbits,unique,2
> +    .globl GB
> +GB:
> +    retq
> +
> +    .section    .init,"ax", at progbits,unique,1
> +    .globl _init
> +_init:
> +    retq
> +
> +    .section    .init,"ax", at progbits,unique,2
> +    .globl _init2
> +_init2:
> +    retq
> +
> +# CHECK:          Name: D
> +# CHECK-NEXT:     Value: 0x201003
> +# CHECK:          Name: A
> +# CHECK-NEXT:     Value: 0x201000
> +# CHECK:          Name: B
> +# CHECK-NEXT:     Value: 0x201001
> +# CHECK:          Name: C
> +# CHECK-NEXT:     Value: 0x201002
> +# CHECK:          Name: GB
> +# CHECK-NEXT:     Value: 0x201007
> +# CHECK:          Name: PP
> +# CHECK-NEXT:     Value: 0x201004
> +# CHECK:          Name: QC
> +# CHECK-NEXT:     Value: 0x201006
> +# CHECK:          Name: TS
> +# CHECK-NEXT:     Value: 0x201005
> +# CHECK:          Name: _init
> +# CHECK-NEXT:     Value: 0x201008
> +# CHECK:          Name: _init2
> +# CHECK-NEXT:     Value: 0x201009
> +
> +# NOSORT:          Name: D
> +# NOSORT-NEXT:     Value: 0x201000
> +# NOSORT:          Name: A
> +# NOSORT-NEXT:     Value: 0x201003
> +# NOSORT:          Name: B
> +# NOSORT-NEXT:     Value: 0x201002
> +# NOSORT:          Name: C
> +# NOSORT-NEXT:     Value: 0x201001
> +# NOSORT:          Name: GB
> +# NOSORT-NEXT:     Value: 0x201007
> +# NOSORT:          Name: PP
> +# NOSORT-NEXT:     Value: 0x201005
> +# NOSORT:          Name: QC
> +# NOSORT-NEXT:     Value: 0x201006
> +# NOSORT:          Name: TS
> +# NOSORT-NEXT:     Value: 0x201004
> +# NOSORT:          Name: _init
> +# NOSORT-NEXT:     Value: 0x201008
> +# NOSORT:          Name: _init2
> +# NOSORT-NEXT:     Value: 0x201009
> Index: ELF/Writer.cpp
> ===================================================================
> --- ELF/Writer.cpp
> +++ ELF/Writer.cpp
> @@ -9,6 +9,7 @@
>  
>  #include "Writer.h"
>  #include "AArch64ErrataFix.h"
> +#include "CallGraphSort.h"
>  #include "Config.h"
>  #include "Filesystem.h"
>  #include "LinkerScript.h"
> @@ -1110,6 +1111,14 @@
>  // If no layout was provided by linker script, we want to apply default
>  // sorting for special input sections. This also handles --symbol-ordering-file.
>  template <class ELFT> void Writer<ELFT>::sortInputSections() {
> +  // Use the rarely used option -call-graph-ordering-file to sort sections.
> +  if (!Config->CallGraphProfile.empty()) {
> +    DenseMap<const InputSectionBase *, int> Order =
> +        computeCallGraphProfileOrder();
> +    for (BaseCommand *Base : Script->SectionCommands)
> +      if (auto *Sec = dyn_cast<OutputSection>(Base))
> +        sortSection(Sec, Order);
> +  }
>    // Build the order once since it is expensive.
>    DenseMap<const InputSectionBase *, int> Order = buildSectionOrder();
>    for (BaseCommand *Base : Script->SectionCommands)
> Index: ELF/Options.td
> ===================================================================
> --- ELF/Options.td
> +++ ELF/Options.td
> @@ -66,6 +66,9 @@
>      "Only set DT_NEEDED for shared libraries if used",
>      "Always set DT_NEEDED for shared libraries">;
>  
> +defm call_graph_ordering_file: Eq<"call-graph-ordering-file">,
> +  HelpText<"Layout sections to optimize the given callgraph">;
> +
>  // -chroot doesn't have a help text because it is an internal option.
>  defm chroot: Eq<"chroot">;
>  
> Index: ELF/Driver.cpp
> ===================================================================
> --- ELF/Driver.cpp
> +++ ELF/Driver.cpp
> @@ -571,6 +571,31 @@
>    return {BuildIdKind::None, {}};
>  }
>  
> +static void readCallGraph(MemoryBufferRef MB) {
> +  // Build a map from symbol name to section
> +  DenseMap<StringRef, InputSectionBase *> SymbolSection;
> +  for (InputFile *File : ObjectFiles)
> +    for (Symbol *Sym : File->getSymbols())
> +      if (auto *D = dyn_cast<Defined>(Sym))
> +        if (auto *IS = dyn_cast_or_null<InputSectionBase>(D->Section))
> +          SymbolSection[D->getName()] = IS;
> +
> +  std::vector<StringRef> Lines = args::getLines(MB);
> +  for (StringRef L : Lines) {
> +    SmallVector<StringRef, 3> Fields;
> +    L.split(Fields, ' ');
> +    if (Fields.size() != 3)
> +      fatal("parse error");
> +    uint64_t Count;
> +    if (!to_integer(Fields[2], Count))
> +      fatal("parse error");
> +    InputSectionBase *FromSec = SymbolSection.lookup(Fields[0]);
> +    InputSectionBase *ToSec = SymbolSection.lookup(Fields[1]);
> +    if (FromSec && ToSec)
> +      Config->CallGraphProfile[std::make_pair(FromSec, ToSec)] = Count;
> +  }
> +}
> +
>  static bool getCompressDebugSections(opt::InputArgList &Args) {
>    StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none");
>    if (S == "none")
> @@ -1118,6 +1143,10 @@
>    // Apply symbol renames for -wrap.
>    Symtab->applySymbolWrap();
>  
> +  if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file))
> +    if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
> +      readCallGraph(*Buffer);
> +
>    // Now that we have a complete list of input files.
>    // Beyond this point, no new files are added.
>    // Aggregate all input sections into one place.
> Index: ELF/Config.h
> ===================================================================
> --- ELF/Config.h
> +++ ELF/Config.h
> @@ -24,6 +24,7 @@
>  namespace elf {
>  
>  class InputFile;
> +class InputSectionBase;
>  
>  enum ELFKind {
>    ELFNoneKind,
> @@ -103,6 +104,9 @@
>    std::vector<SymbolVersion> VersionScriptGlobals;
>    std::vector<SymbolVersion> VersionScriptLocals;
>    std::vector<uint8_t> BuildIdVector;
> +  llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
> +                  uint64_t>
> +      CallGraphProfile;
>    bool AllowMultipleDefinition;
>    bool AndroidPackDynRelocs = false;
>    bool ARMHasBlx = false;
> Index: ELF/CallGraphSort.h
> ===================================================================
> --- /dev/null
> +++ ELF/CallGraphSort.h
> @@ -0,0 +1,23 @@
> +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===//
> +//
> +//                             The LLVM Linker
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#ifndef LLD_ELF_CALL_GRAPH_SORT_H
> +#define LLD_ELF_CALL_GRAPH_SORT_H
> +
> +#include "llvm/ADT/DenseMap.h"
> +
> +namespace lld {
> +namespace elf {
> +class InputSectionBase;
> +
> +llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
> +} // namespace elf
> +} // namespace lld
> +
> +#endif
> Index: ELF/CallGraphSort.cpp
> ===================================================================
> --- /dev/null
> +++ ELF/CallGraphSort.cpp
> @@ -0,0 +1,350 @@
> +//===- CallGraphSort.cpp --------------------------------------------------===//
> +//
> +//                             The LLVM Linker
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +///
> +/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
> +/// for Large-Scale Data-Center Applications
> +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
> +///
> +/// The goal of this algorithm is to improve runtime performance of the final
> +/// executable by arranging code sections such that page table and i-cache
> +/// misses are minimized.
> +///
> +/// Definitions:
> +/// * Cluster
> +///   * An ordered list of input sections which are layed out as a unit. At the
> +///     beginning of the algorithm each input section has its own cluster and
> +///     the weight of the cluster is the sum of the weight of all incomming
> +///     edges.
> +/// * Call-Chain Clustering (C³) Heuristic
> +///   * Defines when and how clusters are combined. Pick the highest weighted
> +///     input section then add it to its most likely predecessor if it wouldn't
> +///     penalize it too much.
> +/// * Density
> +///   * The weight of the cluster divided by the size of the cluster. This is a
> +///     proxy for the ammount of execution time spent per byte of the cluster.
> +///
> +/// It does so given a call graph profile by the following:
> +/// * Build a weighted call graph from the profile
> +/// * Sort input sections by weight
> +/// * For each input section starting with the highest weight
> +///   * Find its most likely predecessor cluster
> +///   * Check if the combined cluster would be too large, or would have too low
> +///     a density.
> +///   * If not, then combine the clusters.
> +/// * Sort non-empty clusters by density
> +///
> +//===----------------------------------------------------------------------===//
> +
> +#include "CallGraphSort.h"
> +#include "OutputSections.h"
> +#include "SymbolTable.h"
> +#include "Symbols.h"
> +
> +#include "llvm/Support/MathExtras.h"
> +
> +using namespace llvm;
> +using namespace lld;
> +using namespace lld::elf;
> +
> +namespace {
> +using ClusterIndex = std::ptrdiff_t;
> +using SectionIndex = std::ptrdiff_t;
> +using EdgeIndex = std::ptrdiff_t;
> +
> +// Used for calculating an comparing density.  Use soft-float for determinism.
> +struct Double : APFloat {
> +  Double() : APFloat(APFloat::IEEEdouble(), 0) {}
> +  Double(uint64_t Val) : APFloat(APFloat::IEEEdouble(), Val) {}
> +  Double(APFloat A) : APFloat(A) {}
> +  bool operator>(const Double Other) const {
> +    return compare(Other) == cmpGreaterThan;
> +  }
> +  bool operator<(const Double Other) const {
> +    return compare(Other) == cmpLessThan;
> +  }
> +};
> +
> +struct Cluster {
> +  Cluster(SectionIndex Sec, const InputSectionBase *IS);
> +
> +  Double getDensity() const {
> +    if (Size == 0)
> +      return 0;
> +    return Double(Weight) / Double(Size);
> +  }
> +
> +  std::vector<const InputSectionBase *> ISBs;
> +  std::vector<SectionIndex> Sections;
> +  int64_t Size = 0;
> +  uint64_t Weight = 0;
> +};
> +
> +struct Section {
> +  Section(const InputSectionBase *IS) : ISB(IS) { Size = ISB->getSize(); }
> +
> +  Double getDensity() const {
> +    if (Size == 0)
> +      return 0;
> +    return Double(Weight) / Double(Size);
> +  }
> +
> +  int64_t Size = 0;
> +  uint64_t Weight = 0;
> +  const InputSectionBase *ISB;
> +  std::vector<SectionIndex> Preds;
> +  std::vector<SectionIndex> Succs;
> +};
> +
> +struct Edge {
> +  SectionIndex From;
> +  SectionIndex To;
> +  uint64_t Weight;
> +  Double NormalizedWeight = 0;
> +
> +  bool operator==(const Edge Other) const;
> +};
> +
> +struct EdgeDenseMapInfo {
> +  static Edge getEmptyKey() {
> +    return {DenseMapInfo<SectionIndex>::getEmptyKey(),
> +            DenseMapInfo<SectionIndex>::getEmptyKey(), 0, 0};
> +  }
> +  static Edge getTombstoneKey() {
> +    return {DenseMapInfo<SectionIndex>::getTombstoneKey(),
> +            DenseMapInfo<SectionIndex>::getTombstoneKey(), 0, 0};
> +  }
> +  static unsigned getHashValue(const Edge &Val) {
> +    return hash_combine(DenseMapInfo<SectionIndex>::getHashValue(Val.From),
> +                        DenseMapInfo<SectionIndex>::getHashValue(Val.To));
> +  }
> +  static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; }
> +};
> +
> +class CallGraphSort {
> +public:
> +  CallGraphSort();
> +
> +  DenseMap<const InputSectionBase *, int> run();
> +
> +private:
> +  DenseMap<Edge, EdgeIndex, EdgeDenseMapInfo> EdgeMap;
> +  std::vector<Cluster> Clusters;
> +  std::vector<Edge> Edges;
> +  std::vector<Section> Sections;
> +
> +  void normalizeEdgeWeights();
> +  void generateClusters();
> +};
> +
> +// Maximum ammount the combined cluster density can be worse than the original
> +// cluster to consider merging.
> +constexpr int MAX_DENSITY_DEGRADATION = 8;
> +
> +// Maximum cluster size in bytes.
> +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
> +} // end anonymous namespace
> +
> +Cluster::Cluster(SectionIndex Sec, const InputSectionBase *IS) {
> +  ISBs.push_back(IS);
> +  Sections.push_back(Sec);
> +  Size = IS->getSize();
> +}
> +
> +bool Edge::operator==(const Edge Other) const {
> +  return From == Other.From && To == Other.To;
> +}
> +
> +// Take the edge list in Config->CallGraphProfile, resolve symbol names to
> +// Symbols, and generate a graph between InputSections with the provided
> +// weights.
> +CallGraphSort::CallGraphSort() {
> +  MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
> +            uint64_t> &Profile = Config->CallGraphProfile;
> +  DenseMap<const InputSectionBase *, SectionIndex> SecToSec;
> +
> +  auto GetOrCreateNode = [&](const InputSectionBase *IS) -> SectionIndex {
> +    auto Res = SecToSec.insert(std::make_pair(IS, Sections.size()));
> +    if (Res.second)
> +      Sections.emplace_back(IS);
> +    return Res.first->second;
> +  };
> +
> +  // Create the graph.
> +  for (const auto &C : Profile) {
> +    const InputSectionBase *FromSB = C.first.first;
> +    const InputSectionBase *ToSB = C.first.second;
> +    uint64_t Weight = C.second;
> +
> +    if (Weight == 0)
> +      continue;
> +
> +    // Ignore edges between input sections belonging to different output
> +    // sections.  This is done because otherwise we would end up with clusters
> +    // containing input sections that can't actually be placed adjacently in the
> +    // output.  This messes with the cluster size and density calculations.  We
> +    // would also end up moving input sections in other output sections without
> +    // moving them closer to what calls them.
> +    if (FromSB->getOutputSection() != ToSB->getOutputSection())
> +      continue;
> +
> +    SectionIndex From = GetOrCreateNode(FromSB);
> +    SectionIndex To = GetOrCreateNode(ToSB);
> +
> +    Sections[To].Weight = SaturatingAdd(Sections[To].Weight, Weight);
> +
> +    if (From == To)
> +      continue;
> +
> +    Edge E{From, To, Weight};
> +
> +    // Add or increment an edge
> +    auto Res = EdgeMap.insert(std::make_pair(E, Edges.size()));
> +    EdgeIndex EI = Res.first->second;
> +    if (Res.second) {
> +      Edges.push_back(E);
> +      Sections[From].Succs.push_back(To);
> +      Sections[To].Preds.push_back(From);
> +    } else
> +      Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, Weight);
> +  }
> +  normalizeEdgeWeights();
> +}
> +
> +// Normalize the edge weights so that we can reject edges which have a low
> +// probibility.
> +void CallGraphSort::normalizeEdgeWeights() {
> +  for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) {
> +    Section &S = Sections[SI];
> +    for (SectionIndex PI : S.Preds) {
> +      Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]];
> +      if (S.Weight == 0)
> +        continue;
> +      E.NormalizedWeight = Double(E.Weight) / Double(S.Weight);
> +    }
> +  }
> +}
> +
> +// It's bad to merge clusters which would degrade the density too much.
> +static bool isNewDensityBad(Cluster &A, Cluster &B) {
> +  Double NewDensity = Double(A.Weight + B.Weight) / Double(A.Size + B.Size);
> +  if (A.getDensity() >
> +      NewDensity * Double(MAX_DENSITY_DEGRADATION))
> +    return true;
> +  return false;
> +}
> +
> +static void mergeClusters(Cluster &Into, Cluster &From) {
> +  Into.ISBs.insert(Into.ISBs.end(), From.ISBs.begin(), From.ISBs.end());
> +  Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
> +                       From.Sections.end());
> +  Into.Size += From.Size;
> +  Into.Weight += From.Weight;
> +  From.ISBs.clear();
> +  From.Sections.clear();
> +  From.Size = 0;
> +  From.Weight = 0;
> +}
> +
> +// Group InputSections into clusters using the Call-Chain Clustering heuristic
> +// then sort the clusters by density.
> +void CallGraphSort::generateClusters() {
> +  // Minimum edge probability to consider merging.
> +  const Double MIN_EDGE_PROBABILITY = Double(1) / Double(10);
> +
> +  std::vector<SectionIndex> SortedSecs;
> +  std::vector<Cluster *> SecToCluster(Sections.size());
> +
> +  Clusters.reserve(Sections.size());
> +
> +  for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) {
> +    Cluster C(SI, Sections[SI].ISB);
> +    C.Size = Sections[SI].Size;
> +    C.Weight = Sections[SI].Weight;
> +    Clusters.push_back(C);
> +    SortedSecs.push_back(SI);
> +  }
> +
> +  for (Cluster &C : Clusters) {
> +    SecToCluster[C.Sections.front()] = &C;
> +  }
> +
> +  std::stable_sort(SortedSecs.begin(), SortedSecs.end(),
> +                   [&](SectionIndex A, SectionIndex B) {
> +                     return Sections[A].getDensity() > Sections[B].getDensity();
> +                   });
> +
> +  for (SectionIndex SI : SortedSecs) {
> +    Cluster &C = *SecToCluster[SI];
> +
> +    SectionIndex BestPred = -1;
> +    Double BestWeight = 0;
> +
> +    for (SectionIndex PI : Sections[SI].Preds) {
> +      Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]];
> +      if (BestPred == -1 || E.NormalizedWeight > BestWeight) {
> +        BestPred = PI;
> +        BestWeight = E.NormalizedWeight;
> +      }
> +    }
> +
> +    if (BestWeight < MIN_EDGE_PROBABILITY)
> +      continue;
> +
> +    Cluster *PredC = SecToCluster[BestPred];
> +    if (PredC == nullptr || PredC == &C)
> +      continue;
> +
> +    if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
> +      continue;
> +
> +    if (isNewDensityBad(*PredC, C))
> +      continue;
> +
> +    for (SectionIndex SI : C.Sections)
> +      SecToCluster[SI] = PredC;
> +
> +    mergeClusters(*PredC, C);
> +  }
> +
> +  // Remove empty or dead nodes.
> +  Clusters.erase(std::remove_if(Clusters.begin(), Clusters.end(),
> +                                [](const Cluster &C) {
> +                                  return C.Size == 0 || C.Sections.empty();
> +                                }),
> +                 Clusters.end());
> +
> +  // Sort by density. Invalidates all NodeIndexs.
> +  std::sort(Clusters.begin(), Clusters.end(),
> +            [](const Cluster &A, const Cluster &B) {
> +              return A.getDensity() > B.getDensity();
> +            });
> +}
> +
> +DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
> +  generateClusters();
> +
> +  // Generate order.
> +  llvm::DenseMap<const InputSectionBase *, int> OrderMap;
> +  ssize_t CurOrder = 1;
> +
> +  for (const Cluster &C : Clusters)
> +    for (const InputSectionBase *IS : C.ISBs)
> +      OrderMap[IS] = CurOrder++;
> +
> +  return OrderMap;
> +}
> +
> +// Sort sections by the profile data provided by -callgraph-profile-file
> +//
> +// This first builds a call graph based on the profile data then merges sections
> +// according to the C³ huristic. All clusters are then sorted by a density
> +// metric to further improve locality.
> +DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
> +  return CallGraphSort().run();
> +}
> Index: ELF/CMakeLists.txt
> ===================================================================
> --- ELF/CMakeLists.txt
> +++ ELF/CMakeLists.txt
> @@ -19,6 +19,7 @@
>    Arch/SPARCV9.cpp
>    Arch/X86.cpp
>    Arch/X86_64.cpp
> +  CallGraphSort.cpp
>    Driver.cpp
>    DriverUtils.cpp
>    EhFrame.cpp
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits