[PATCH] D36351: [lld][ELF] Add profile guided section layout

Thu Mar 8 09:50:42 PST 2018

rafael added a comment.

I now get this warning:

/home/espindola/llvm/llvm/tools/lld/ELF/CallGraphSort.cpp:204:28:
warning: missing field 'NormalizedWeight' initializer
[-Wmissing-field-initializers]

  Edge E{From, To, Weight};

And git-clang-format still produces this diff:

- if (A.getDensity() >
- NewDensity * Double(MAX_DENSITY_DEGRADATION))

+  if (A.getDensity() > NewDensity * Double(MAX_DENSITY_DEGRADATION))

Cheers,
Rafael

Michael Spencer via Phabricator via llvm-commits
<llvm-commits at lists.llvm.org> writes:

> Bigcheese updated this revision to Diff 137539.
>  Bigcheese added a comment.
> 
> Address review comments.
> 
> https://reviews.llvm.org/D36351
> 
> Files:
> 
>   ELF/CMakeLists.txt
>   ELF/CallGraphSort.cpp
>   ELF/CallGraphSort.h
>   ELF/Config.h
>   ELF/Driver.cpp
>   ELF/Options.td
>   ELF/Writer.cpp
>   test/ELF/cgprofile-txt.s
> 
> Index: test/ELF/cgprofile-txt.s
>  ===================================================================
> 
> - /dev/null +++ test/ELF/cgprofile-txt.s @@ -0,0 +1,106 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: ld.lld -e A %t -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "B C 30" >> %t.call_graph +# RUN: echo "C D 90" >> %t.call_graph +# RUN: echo "PP TS 100" >> %t.call_graph +# RUN: echo "_init2 _init 24567837" >> %t.call_graph +# RUN: echo "TS QC 9001" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s + +    .section    .text.D,"ax", at progbits +D: +    retq + +    .section    .text.C,"ax", at progbits +    .globl  C +C: +    retq + +    .section    .text.B,"ax", at progbits +    .globl  B +B: +    retq + +    .section    .text.A,"ax", at progbits +    .globl  A +A: +    retq + +    .section    .ponies,"ax", at progbits,unique,1 +    .globl TS +TS: +    retq + +    .section    .ponies,"ax", at progbits,unique,2 +    .globl PP +PP: +    retq + +    .section    .other,"ax", at progbits,unique,1 +    .globl QC +QC: +    retq + +    .section    .other,"ax", at progbits,unique,2 +    .globl GB +GB: +    retq + +    .section    .init,"ax", at progbits,unique,1 +    .globl _init +_init: +    retq + +    .section    .init,"ax", at progbits,unique,2 +    .globl _init2 +_init2: +    retq + +# CHECK:          Name: D +# CHECK-NEXT:     Value: 0x201003 +# CHECK:          Name: A +# CHECK-NEXT:     Value: 0x201000 +# CHECK:          Name: B +# CHECK-NEXT:     Value: 0x201001 +# CHECK:          Name: C +# CHECK-NEXT:     Value: 0x201002 +# CHECK:          Name: GB +# CHECK-NEXT:     Value: 0x201007 +# CHECK:          Name: PP +# CHECK-NEXT:     Value: 0x201004 +# CHECK:          Name: QC +# CHECK-NEXT:     Value: 0x201006 +# CHECK:          Name: TS +# CHECK-NEXT:     Value: 0x201005 +# CHECK:          Name: _init +# CHECK-NEXT:     Value: 0x201008 +# CHECK:          Name: _init2 +# CHECK-NEXT:     Value: 0x201009 + +# NOSORT:          Name: D +# NOSORT-NEXT:     Value: 0x201000 +# NOSORT:          Name: A +# NOSORT-NEXT:     Value: 0x201003 +# NOSORT:          Name: B +# NOSORT-NEXT:     Value: 0x201002 +# NOSORT:          Name: C +# NOSORT-NEXT:     Value: 0x201001 +# NOSORT:          Name: GB +# NOSORT-NEXT:     Value: 0x201007 +# NOSORT:          Name: PP +# NOSORT-NEXT:     Value: 0x201005 +# NOSORT:          Name: QC +# NOSORT-NEXT:     Value: 0x201006 +# NOSORT:          Name: TS +# NOSORT-NEXT:     Value: 0x201004 +# NOSORT:          Name: _init +# NOSORT-NEXT:     Value: 0x201008 +# NOSORT:          Name: _init2 +# NOSORT-NEXT:     Value: 0x201009 Index: ELF/Writer.cpp ===================================================================
> - ELF/Writer.cpp +++ ELF/Writer.cpp @@ -9,6 +9,7 @@
> 
>   #include "Writer.h" #include "AArch64ErrataFix.h" +#include "CallGraphSort.h" #include "Config.h" #include "Filesystem.h" #include "LinkerScript.h" @@ -1020,6 +1021,10 @@ // Builds section order for handling --symbol-ordering-file. static DenseMap<const InputSectionBase *, int> buildSectionOrder() { DenseMap<const InputSectionBase *, int> SectionOrder; +  // Use the rarely used option -call-graph-ordering-file to sort sections. +  if (!Config->CallGraphProfile.empty()) +    return computeCallGraphProfileOrder(); + if (Config->SymbolOrderingFile.empty()) return SectionOrder;
> 
>   Index: ELF/Options.td ===================================================================
> - ELF/Options.td +++ ELF/Options.td @@ -66,6 +66,9 @@ "Only set DT_NEEDED for shared libraries if used", "Always set DT_NEEDED for shared libraries">;
> 
>   +defm call_graph_ordering_file: Eq<"call-graph-ordering-file">, +  HelpText<"Layout sections to optimize the given callgraph">; + // -chroot doesn't have a help text because it is an internal option. defm chroot: Eq<"chroot">;
> 
>   Index: ELF/Driver.cpp ===================================================================
> - ELF/Driver.cpp +++ ELF/Driver.cpp @@ -571,6 +571,31 @@ return {BuildIdKind::None, {}}; }
> 
>   +static void readCallGraph(MemoryBufferRef MB) { +  // Build a map from symbol name to section +  DenseMap<StringRef, InputSectionBase *> SymbolSection; +  for (InputFile *File : ObjectFiles) +    for (Symbol *Sym : File->getSymbols()) +      if (auto *D = dyn_cast<Defined>(Sym)) +        if (auto *IS = dyn_cast_or_null<InputSectionBase>(D->Section)) +          SymbolSection[D->getName()] = IS; + +  std::vector<StringRef> Lines = args::getLines(MB); +  for (StringRef L : Lines) { +    SmallVector<StringRef, 3> Fields; +    L.split(Fields, ' '); +    if (Fields.size() != 3) +      fatal("parse error"); +    uint64_t Count; +    if (!to_integer(Fields[2], Count)) +      fatal("parse error"); +    InputSectionBase *FromSec = SymbolSection.lookup(Fields[0]); +    InputSectionBase *ToSec = SymbolSection.lookup(Fields[1]); +    if (FromSec && ToSec) +      Config->CallGraphProfile[std::make_pair(FromSec, ToSec)] += Count; +  } +} + static bool getCompressDebugSections(opt::InputArgList &Args) { StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none"); if (S == "none") @@ -1124,6 +1149,10 @@ // Apply symbol renames for -wrap. Symtab->applySymbolWrap();
> 
>   +  if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file)) +    if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue())) +      readCallGraph(*Buffer); + // Now that we have a complete list of input files. // Beyond this point, no new files are added. // Aggregate all input sections into one place. Index: ELF/Config.h ===================================================================
> - ELF/Config.h +++ ELF/Config.h @@ -24,6 +24,7 @@ namespace elf {
> 
>   class InputFile; +class InputSectionBase;
> 
>   enum ELFKind { ELFNoneKind, @@ -103,6 +104,9 @@ std::vector<SymbolVersion> VersionScriptGlobals; std::vector<SymbolVersion> VersionScriptLocals; std::vector<uint8_t> BuildIdVector; +  llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>, +                  uint64_t> +      CallGraphProfile; bool AllowMultipleDefinition; bool AndroidPackDynRelocs = false; bool ARMHasBlx = false; Index: ELF/CallGraphSort.h ===================================================================
> - /dev/null +++ ELF/CallGraphSort.h @@ -0,0 +1,23 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +//                             The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_CALL_GRAPH_SORT_H +#define LLD_ELF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace elf { +class InputSectionBase; + +llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder(); +} // namespace elf +} // namespace lld + +#endif Index: ELF/CallGraphSort.cpp ===================================================================
> - /dev/null +++ ELF/CallGraphSort.cpp @@ -0,0 +1,350 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +//                             The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of Call-Chain Clustering from: Optimizing Function Placement +/// for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// Definitions: +/// * Cluster +///   * An ordered list of input sections which are layed out as a unit. At the +///     beginning of the algorithm each input section has its own cluster and +///     the weight of the cluster is the sum of the weight of all incomming +///     edges. +/// * Call-Chain Clustering (C³) Heuristic +///   * Defines when and how clusters are combined. Pick the highest weighted +///     input section then add it to its most likely predecessor if it wouldn't +///     penalize it too much. +/// * Density +///   * The weight of the cluster divided by the size of the cluster. This is a +///     proxy for the ammount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +///   * Find its most likely predecessor cluster +///   * Check if the combined cluster would be too large, or would have too low +///     a density. +///   * If not, then combine the clusters. +/// * Sort non-empty clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "OutputSections.h" +#include "SymbolTable.h" +#include "Symbols.h" + +#include "llvm/Support/MathExtras.h" + +using namespace llvm; +using namespace lld; +using namespace lld::elf; + +namespace { +using ClusterIndex = std::ptrdiff_t; +using SectionIndex = std::ptrdiff_t; +using EdgeIndex = std::ptrdiff_t; + +// Used for calculating an comparing density.  Use soft-float for determinism. +struct Double : APFloat { +  Double() : APFloat(APFloat::IEEEdouble(), 0) {} +  Double(uint64_t Val) : APFloat(APFloat::IEEEdouble(), Val) {} +  Double(APFloat A) : APFloat(A) {} +  bool operator>(const Double Other) const { +    return compare(Other) == cmpGreaterThan; +  } +  bool operator<(const Double Other) const { +    return compare(Other) == cmpLessThan; +  } +}; + +struct Cluster { +  Cluster(SectionIndex Sec, const InputSectionBase *IS); + +  Double getDensity() const { +    if (Size == 0) +      return 0; +    return Double(Weight) / Double(Size); +  } + +  std::vector<const InputSectionBase *> ISBs; +  std::vector<SectionIndex> Sections; +  size_t Size = 0; +  uint64_t Weight = 0; +}; + +struct Section { +  Section(const InputSectionBase *IS) : ISB(IS) { Size = ISB->getSize(); } + +  Double getDensity() const { +    if (Size == 0) +      return 0; +    return Double(Weight) / Double(Size); +  } + +  size_t Size = 0; +  uint64_t Weight = 0; +  const InputSectionBase *ISB; +  std::vector<SectionIndex> Preds; +  std::vector<SectionIndex> Succs; +}; + +struct Edge { +  SectionIndex From; +  SectionIndex To; +  uint64_t Weight; +  Double NormalizedWeight; + +  bool operator==(const Edge Other) const; +}; + +struct EdgeDenseMapInfo { +  static Edge getEmptyKey() { +    return {DenseMapInfo<SectionIndex>::getEmptyKey(), +            DenseMapInfo<SectionIndex>::getEmptyKey(), 0, 0}; +  } +  static Edge getTombstoneKey() { +    return {DenseMapInfo<SectionIndex>::getTombstoneKey(), +            DenseMapInfo<SectionIndex>::getTombstoneKey(), 0, 0}; +  } +  static unsigned getHashValue(const Edge &Val) { +    return hash_combine(DenseMapInfo<SectionIndex>::getHashValue(Val.From), +                        DenseMapInfo<SectionIndex>::getHashValue(Val.To)); +  } +  static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; } +}; + +class CallGraphSort { +public: +  CallGraphSort(); + +  DenseMap<const InputSectionBase *, int> run(); + +private: +  DenseMap<Edge, EdgeIndex, EdgeDenseMapInfo> EdgeMap; +  std::vector<Cluster> Clusters; +  std::vector<Edge> Edges; +  std::vector<Section> Sections; + +  void normalizeEdgeWeights(); +  void generateClusters(); +}; + +// Maximum ammount the combined cluster density can be worse than the original +// cluster to consider merging. +constexpr int MAX_DENSITY_DEGRADATION = 8; + +// Maximum cluster size in bytes. +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; +} // end anonymous namespace + +Cluster::Cluster(SectionIndex Sec, const InputSectionBase *IS) { +  ISBs.push_back(IS); +  Sections.push_back(Sec); +  Size = IS->getSize(); +} + +bool Edge::operator==(const Edge Other) const { +  return From == Other.From && To == Other.To; +} + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort() { +  MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>, +            uint64_t> &Profile = Config->CallGraphProfile; +  DenseMap<const InputSectionBase *, SectionIndex> SecToSec; + +  auto GetOrCreateNode = [&](const InputSectionBase *IS) -> SectionIndex { +    auto Res = SecToSec.insert(std::make_pair(IS, Sections.size())); +    if (Res.second) +      Sections.emplace_back(IS); +    return Res.first->second; +  }; + +  // Create the graph. +  for (const auto &C : Profile) { +    const InputSectionBase *FromSB = C.first.first; +    const InputSectionBase *ToSB = C.first.second; +    uint64_t Weight = C.second; + +    if (Weight == 0) +      continue; + +    // Ignore edges between input sections belonging to different output +    // sections.  This is done because otherwise we would end up with clusters +    // containing input sections that can't actually be placed adjacently in the +    // output.  This messes with the cluster size and density calculations.  We +    // would also end up moving input sections in other output sections without +    // moving them closer to what calls them. +    if (FromSB->getOutputSection() != ToSB->getOutputSection()) +      continue; + +    SectionIndex From = GetOrCreateNode(FromSB); +    SectionIndex To = GetOrCreateNode(ToSB); + +    Sections[To].Weight = SaturatingAdd(Sections[To].Weight, Weight); + +    if (From == To) +      continue; + +    Edge E{From, To, Weight}; + +    // Add or increment an edge +    auto Res = EdgeMap.insert(std::make_pair(E, Edges.size())); +    EdgeIndex EI = Res.first->second; +    if (Res.second) { +      Edges.push_back(E); +      Sections[From].Succs.push_back(To); +      Sections[To].Preds.push_back(From); +    } else +      Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, Weight); +  } +  normalizeEdgeWeights(); +} + +// Normalize the edge weights so that we can reject edges which have a low +// probibility. +void CallGraphSort::normalizeEdgeWeights() { +  for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) { +    Section &S = Sections[SI]; +    for (SectionIndex PI : S.Preds) { +      Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]]; +      if (S.Weight == 0) +        continue; +      E.NormalizedWeight = Double(E.Weight) / Double(S.Weight); +    } +  } +} + +// It's bad to merge clusters which would degrade the density too much. +static bool isNewDensityBad(Cluster &A, Cluster &B) { +  Double NewDensity = Double(A.Weight + B.Weight) / Double(A.Size + B.Size); +  if (A.getDensity() > +      NewDensity * Double(MAX_DENSITY_DEGRADATION)) +    return true; +  return false; +} + +static void mergeClusters(Cluster &Into, Cluster &From) { +  Into.ISBs.insert(Into.ISBs.end(), From.ISBs.begin(), From.ISBs.end()); +  Into.Sections.insert(Into.Sections.end(), From.Sections.begin(), +                       From.Sections.end()); +  Into.Size += From.Size; +  Into.Weight += From.Weight; +  From.ISBs.clear(); +  From.Sections.clear(); +  From.Size = 0; +  From.Weight = 0; +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +void CallGraphSort::generateClusters() { +  // Minimum edge probability to consider merging. +  const Double MIN_EDGE_PROBABILITY = Double(1) / Double(10); + +  std::vector<SectionIndex> SortedSecs; +  std::vector<Cluster *> SecToCluster(Sections.size()); + +  Clusters.reserve(Sections.size()); + +  for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) { +    Cluster C(SI, Sections[SI].ISB); +    C.Size = Sections[SI].Size; +    C.Weight = Sections[SI].Weight; +    Clusters.push_back(C); +    SortedSecs.push_back(SI); +  } + +  for (Cluster &C : Clusters) { +    SecToCluster[C.Sections.front()] = &C; +  } + +  std::stable_sort(SortedSecs.begin(), SortedSecs.end(), +                   [&](SectionIndex A, SectionIndex B) { +                     return Sections[A].getDensity() > Sections[B].getDensity(); +                   }); + +  for (SectionIndex SI : SortedSecs) { +    Cluster &C = *SecToCluster[SI]; + +    SectionIndex BestPred = -1; +    Double BestWeight = 0; + +    for (SectionIndex PI : Sections[SI].Preds) { +      Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]]; +      if (BestPred == -1 || E.NormalizedWeight > BestWeight) { +        BestPred = PI; +        BestWeight = E.NormalizedWeight; +      } +    } + +    if (BestWeight < MIN_EDGE_PROBABILITY) +      continue; + +    Cluster *PredC = SecToCluster[BestPred]; +    if (PredC == nullptr || PredC == &C) +      continue; + +    if (C.Size + PredC->Size > MAX_CLUSTER_SIZE) +      continue; + +    if (isNewDensityBad(*PredC, C)) +      continue; + +    for (SectionIndex SI : C.Sections) +      SecToCluster[SI] = PredC; + +    mergeClusters(*PredC, C); +  } + +  // Remove empty or dead nodes. +  Clusters.erase(std::remove_if(Clusters.begin(), Clusters.end(), +                                [](const Cluster &C) { +                                  return C.Size == 0 || C.Sections.empty(); +                                }), +                 Clusters.end()); + +  // Sort by density. Invalidates all NodeIndexs. +  std::sort(Clusters.begin(), Clusters.end(), +            [](const Cluster &A, const Cluster &B) { +              return A.getDensity() > B.getDensity(); +            }); +} + +DenseMap<const InputSectionBase *, int> CallGraphSort::run() { +  generateClusters(); + +  // Generate order. +  llvm::DenseMap<const InputSectionBase *, int> OrderMap; +  ssize_t CurOrder = 1; + +  for (const Cluster &C : Clusters) +    for (const InputSectionBase *IS : C.ISBs) +      OrderMap[IS] = CurOrder++; + +  return OrderMap; +} + +// Sort sections by the profile data provided by -callgraph-profile-file +// +// This first builds a call graph based on the profile data then merges sections +// according to the C³ huristic. All clusters are then sorted by a density +// metric to further improve locality. +DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() { +  return CallGraphSort().run(); +} Index: ELF/CMakeLists.txt ===================================================================
> - ELF/CMakeLists.txt +++ ELF/CMakeLists.txt @@ -19,6 +19,7 @@ Arch/SPARCV9.cpp Arch/X86.cpp Arch/X86_64.cpp +  CallGraphSort.cpp Driver.cpp DriverUtils.cpp EhFrame.cpp _______________________________________________ llvm-commits mailing list llvm-commits at lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits

https://reviews.llvm.org/D36351