[PATCH] D36351: [lld][ELF] Add profile guided section layout
Rafael Avila de Espindola via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 6 15:03:42 PST 2018
Ah, and please git-clang-format :-)
Cheers,
Rafael
Michael Spencer via Phabricator via llvm-commits
<llvm-commits at lists.llvm.org> writes:
> Bigcheese updated this revision to Diff 137040.
> Bigcheese added a comment.
>
> Rewrote the algorithm to match hfsort. Now gets the same performance as hfsort in Rafael's testcase.
>
>
> https://reviews.llvm.org/D36351
>
> Files:
> ELF/CMakeLists.txt
> ELF/CallGraphSort.cpp
> ELF/CallGraphSort.h
> ELF/Config.h
> ELF/Driver.cpp
> ELF/Options.td
> ELF/Writer.cpp
> test/ELF/cgprofile-txt.s
>
> Index: test/ELF/cgprofile-txt.s
> ===================================================================
> --- /dev/null
> +++ test/ELF/cgprofile-txt.s
> @@ -0,0 +1,106 @@
> +# REQUIRES: x86
> +
> +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
> +# RUN: ld.lld -e A %t -o %t2
> +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT
> +
> +# RUN: echo "A B 100" > %t.call_graph
> +# RUN: echo "A C 40" >> %t.call_graph
> +# RUN: echo "B C 30" >> %t.call_graph
> +# RUN: echo "C D 90" >> %t.call_graph
> +# RUN: echo "PP TS 100" >> %t.call_graph
> +# RUN: echo "_init2 _init 24567837" >> %t.call_graph
> +# RUN: echo "TS QC 9001" >> %t.call_graph
> +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
> +# RUN: llvm-readobj -symbols %t2 | FileCheck %s
> +
> + .section .text.D,"ax",@progbits
> +D:
> + retq
> +
> + .section .text.C,"ax",@progbits
> + .globl C
> +C:
> + retq
> +
> + .section .text.B,"ax",@progbits
> + .globl B
> +B:
> + retq
> +
> + .section .text.A,"ax",@progbits
> + .globl A
> +A:
> + retq
> +
> + .section .ponies,"ax",@progbits,unique,1
> + .globl TS
> +TS:
> + retq
> +
> + .section .ponies,"ax",@progbits,unique,2
> + .globl PP
> +PP:
> + retq
> +
> + .section .other,"ax",@progbits,unique,1
> + .globl QC
> +QC:
> + retq
> +
> + .section .other,"ax",@progbits,unique,2
> + .globl GB
> +GB:
> + retq
> +
> + .section .init,"ax",@progbits,unique,1
> + .globl _init
> +_init:
> + retq
> +
> + .section .init,"ax",@progbits,unique,2
> + .globl _init2
> +_init2:
> + retq
> +
> +# CHECK: Name: D
> +# CHECK-NEXT: Value: 0x201003
> +# CHECK: Name: A
> +# CHECK-NEXT: Value: 0x201000
> +# CHECK: Name: B
> +# CHECK-NEXT: Value: 0x201001
> +# CHECK: Name: C
> +# CHECK-NEXT: Value: 0x201002
> +# CHECK: Name: GB
> +# CHECK-NEXT: Value: 0x201007
> +# CHECK: Name: PP
> +# CHECK-NEXT: Value: 0x201004
> +# CHECK: Name: QC
> +# CHECK-NEXT: Value: 0x201006
> +# CHECK: Name: TS
> +# CHECK-NEXT: Value: 0x201005
> +# CHECK: Name: _init
> +# CHECK-NEXT: Value: 0x201008
> +# CHECK: Name: _init2
> +# CHECK-NEXT: Value: 0x201009
> +
> +# NOSORT: Name: D
> +# NOSORT-NEXT: Value: 0x201000
> +# NOSORT: Name: A
> +# NOSORT-NEXT: Value: 0x201003
> +# NOSORT: Name: B
> +# NOSORT-NEXT: Value: 0x201002
> +# NOSORT: Name: C
> +# NOSORT-NEXT: Value: 0x201001
> +# NOSORT: Name: GB
> +# NOSORT-NEXT: Value: 0x201007
> +# NOSORT: Name: PP
> +# NOSORT-NEXT: Value: 0x201005
> +# NOSORT: Name: QC
> +# NOSORT-NEXT: Value: 0x201006
> +# NOSORT: Name: TS
> +# NOSORT-NEXT: Value: 0x201004
> +# NOSORT: Name: _init
> +# NOSORT-NEXT: Value: 0x201008
> +# NOSORT: Name: _init2
> +# NOSORT-NEXT: Value: 0x201009
> Index: ELF/Writer.cpp
> ===================================================================
> --- ELF/Writer.cpp
> +++ ELF/Writer.cpp
> @@ -9,6 +9,7 @@
>
> #include "Writer.h"
> #include "AArch64ErrataFix.h"
> +#include "CallGraphSort.h"
> #include "Config.h"
> #include "Filesystem.h"
> #include "LinkerScript.h"
> @@ -1110,6 +1111,14 @@
> // If no layout was provided by linker script, we want to apply default
> // sorting for special input sections. This also handles --symbol-ordering-file.
> template <class ELFT> void Writer<ELFT>::sortInputSections() {
> + // Use the rarely used option -call-graph-ordering-file to sort sections.
> + if (!Config->CallGraphProfile.empty()) {
> + DenseMap<const InputSectionBase *, int> Order =
> + computeCallGraphProfileOrder();
> + for (BaseCommand *Base : Script->SectionCommands)
> + if (auto *Sec = dyn_cast<OutputSection>(Base))
> + sortSection(Sec, Order);
> + }
> // Build the order once since it is expensive.
> DenseMap<const InputSectionBase *, int> Order = buildSectionOrder();
> for (BaseCommand *Base : Script->SectionCommands)
> Index: ELF/Options.td
> ===================================================================
> --- ELF/Options.td
> +++ ELF/Options.td
> @@ -66,6 +66,9 @@
> "Only set DT_NEEDED for shared libraries if used",
> "Always set DT_NEEDED for shared libraries">;
>
> +defm call_graph_ordering_file: Eq<"call-graph-ordering-file">,
> + HelpText<"Layout sections to optimize the given callgraph">;
> +
> // -chroot doesn't have a help text because it is an internal option.
> defm chroot: Eq<"chroot">;
>
> Index: ELF/Driver.cpp
> ===================================================================
> --- ELF/Driver.cpp
> +++ ELF/Driver.cpp
> @@ -571,6 +571,31 @@
> return {BuildIdKind::None, {}};
> }
>
> +static void readCallGraph(MemoryBufferRef MB) {
> + // Build a map from symbol name to section
> + DenseMap<StringRef, InputSectionBase *> SymbolSection;
> + for (InputFile *File : ObjectFiles)
> + for (Symbol *Sym : File->getSymbols())
> + if (auto *D = dyn_cast<Defined>(Sym))
> + if (auto *IS = dyn_cast_or_null<InputSectionBase>(D->Section))
> + SymbolSection[D->getName()] = IS;
> +
> + std::vector<StringRef> Lines = args::getLines(MB);
> + for (StringRef L : Lines) {
> + SmallVector<StringRef, 3> Fields;
> + L.split(Fields, ' ');
> + if (Fields.size() != 3)
> + fatal("parse error");
> + uint64_t Count;
> + if (!to_integer(Fields[2], Count))
> + fatal("parse error");
> + InputSectionBase *FromSec = SymbolSection.lookup(Fields[0]);
> + InputSectionBase *ToSec = SymbolSection.lookup(Fields[1]);
> + if (FromSec && ToSec)
> + Config->CallGraphProfile[std::make_pair(FromSec, ToSec)] = Count;
> + }
> +}
> +
> static bool getCompressDebugSections(opt::InputArgList &Args) {
> StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none");
> if (S == "none")
> @@ -1118,6 +1143,10 @@
> // Apply symbol renames for -wrap.
> Symtab->applySymbolWrap();
>
> + if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file))
> + if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
> + readCallGraph(*Buffer);
> +
> // Now that we have a complete list of input files.
> // Beyond this point, no new files are added.
> // Aggregate all input sections into one place.
> Index: ELF/Config.h
> ===================================================================
> --- ELF/Config.h
> +++ ELF/Config.h
> @@ -24,6 +24,7 @@
> namespace elf {
>
> class InputFile;
> +class InputSectionBase;
>
> enum ELFKind {
> ELFNoneKind,
> @@ -103,6 +104,9 @@
> std::vector<SymbolVersion> VersionScriptGlobals;
> std::vector<SymbolVersion> VersionScriptLocals;
> std::vector<uint8_t> BuildIdVector;
> + llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
> + uint64_t>
> + CallGraphProfile;
> bool AllowMultipleDefinition;
> bool AndroidPackDynRelocs = false;
> bool ARMHasBlx = false;
> Index: ELF/CallGraphSort.h
> ===================================================================
> --- /dev/null
> +++ ELF/CallGraphSort.h
> @@ -0,0 +1,23 @@
> +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===//
> +//
> +// The LLVM Linker
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#ifndef LLD_ELF_CALL_GRAPH_SORT_H
> +#define LLD_ELF_CALL_GRAPH_SORT_H
> +
> +#include "llvm/ADT/DenseMap.h"
> +
> +namespace lld {
> +namespace elf {
> +class InputSectionBase;
> +
> +llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
> +} // namespace elf
> +} // namespace lld
> +
> +#endif
> Index: ELF/CallGraphSort.cpp
> ===================================================================
> --- /dev/null
> +++ ELF/CallGraphSort.cpp
> @@ -0,0 +1,350 @@
> +//===- CallGraphSort.cpp --------------------------------------------------===//
> +//
> +// The LLVM Linker
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +///
> +/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
> +/// for Large-Scale Data-Center Applications
> +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
> +///
> +/// The goal of this algorithm is to improve runtime performance of the final
> +/// executable by arranging code sections such that page table and i-cache
> +/// misses are minimized.
> +///
> +/// Definitions:
> +/// * Cluster
> +/// * An ordered list of input sections which are laid out as a unit. At the
> +/// beginning of the algorithm each input section has its own cluster and
> +/// the weight of the cluster is the sum of the weight of all incoming
> +/// edges.
> +/// * Call-Chain Clustering (C³) Heuristic
> +/// * Defines when and how clusters are combined. Pick the highest weighted
> +/// input section then add it to its most likely predecessor if it wouldn't
> +/// penalize it too much.
> +/// * Density
> +/// * The weight of the cluster divided by the size of the cluster. This is a
> +/// proxy for the amount of execution time spent per byte of the cluster.
> +///
> +/// It does so given a call graph profile by the following:
> +/// * Build a weighted call graph from the profile
> +/// * Sort input sections by weight
> +/// * For each input section starting with the highest weight
> +/// * Find its most likely predecessor cluster
> +/// * Check if the combined cluster would be too large, or would have too low
> +/// a density.
> +/// * If not, then combine the clusters.
> +/// * Sort non-empty clusters by density
> +///
> +//===----------------------------------------------------------------------===//
> +
> +#include "CallGraphSort.h"
> +#include "OutputSections.h"
> +#include "SymbolTable.h"
> +#include "Symbols.h"
> +
> +#include "llvm/Support/MathExtras.h"
> +
> +using namespace llvm;
> +using namespace lld;
> +using namespace lld::elf;
> +
> +namespace {
> +using ClusterIndex = std::ptrdiff_t;
> +using SectionIndex = std::ptrdiff_t;
> +using EdgeIndex = std::ptrdiff_t;
> +
> +// Used for calculating and comparing density. Use soft-float for determinism.
> +struct Double : APFloat {
> + Double() : APFloat(APFloat::IEEEdouble(), 0) {}
> + Double(uint64_t Val) : APFloat(APFloat::IEEEdouble(), Val) {}
> + Double(APFloat A) : APFloat(A) {}
> + bool operator>(const Double Other) const {
> + return compare(Other) == cmpGreaterThan;
> + }
> + bool operator<(const Double Other) const {
> + return compare(Other) == cmpLessThan;
> + }
> +};
> +
> +struct Cluster {
> + Cluster(SectionIndex Sec, const InputSectionBase *IS);
> +
> + Double getDensity() const {
> + if (Size == 0)
> + return 0;
> + return Double(Weight) / Double(Size);
> + }
> +
> + std::vector<const InputSectionBase *> ISBs;
> + std::vector<SectionIndex> Sections;
> + int64_t Size = 0;
> + uint64_t Weight = 0;
> +};
> +
> +struct Section {
> + Section(const InputSectionBase *IS) : ISB(IS) { Size = ISB->getSize(); }
> +
> + Double getDensity() const {
> + if (Size == 0)
> + return 0;
> + return Double(Weight) / Double(Size);
> + }
> +
> + int64_t Size = 0;
> + uint64_t Weight = 0;
> + const InputSectionBase *ISB;
> + std::vector<SectionIndex> Preds;
> + std::vector<SectionIndex> Succs;
> +};
> +
> +struct Edge {
> + SectionIndex From;
> + SectionIndex To;
> + uint64_t Weight;
> + Double NormalizedWeight = 0;
> +
> + bool operator==(const Edge Other) const;
> +};
> +
> +struct EdgeDenseMapInfo {
> + static Edge getEmptyKey() {
> + return {DenseMapInfo<SectionIndex>::getEmptyKey(),
> + DenseMapInfo<SectionIndex>::getEmptyKey(), 0, 0};
> + }
> + static Edge getTombstoneKey() {
> + return {DenseMapInfo<SectionIndex>::getTombstoneKey(),
> + DenseMapInfo<SectionIndex>::getTombstoneKey(), 0, 0};
> + }
> + static unsigned getHashValue(const Edge &Val) {
> + return hash_combine(DenseMapInfo<SectionIndex>::getHashValue(Val.From),
> + DenseMapInfo<SectionIndex>::getHashValue(Val.To));
> + }
> + static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; }
> +};
> +
> +class CallGraphSort {
> +public:
> + CallGraphSort();
> +
> + DenseMap<const InputSectionBase *, int> run();
> +
> +private:
> + DenseMap<Edge, EdgeIndex, EdgeDenseMapInfo> EdgeMap;
> + std::vector<Cluster> Clusters;
> + std::vector<Edge> Edges;
> + std::vector<Section> Sections;
> +
> + void normalizeEdgeWeights();
> + void generateClusters();
> +};
> +
> +// Maximum amount the combined cluster density can be worse than the original
> +// cluster to consider merging.
> +constexpr int MAX_DENSITY_DEGRADATION = 8;
> +
> +// Maximum cluster size in bytes.
> +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
> +} // end anonymous namespace
> +
> +Cluster::Cluster(SectionIndex Sec, const InputSectionBase *IS) {
> + ISBs.push_back(IS);
> + Sections.push_back(Sec);
> + Size = IS->getSize();
> +}
> +
> +bool Edge::operator==(const Edge Other) const {
> + return From == Other.From && To == Other.To;
> +}
> +
> +// Take the edge list in Config->CallGraphProfile, which is already keyed by
> +// pairs of InputSections, and generate a graph between those sections with
> +// the provided weights.
> +CallGraphSort::CallGraphSort() {
> + MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
> + uint64_t> &Profile = Config->CallGraphProfile;
> + DenseMap<const InputSectionBase *, SectionIndex> SecToSec;
> +
> + auto GetOrCreateNode = [&](const InputSectionBase *IS) -> SectionIndex {
> + auto Res = SecToSec.insert(std::make_pair(IS, Sections.size()));
> + if (Res.second)
> + Sections.emplace_back(IS);
> + return Res.first->second;
> + };
> +
> + // Create the graph.
> + for (const auto &C : Profile) {
> + const InputSectionBase *FromSB = C.first.first;
> + const InputSectionBase *ToSB = C.first.second;
> + uint64_t Weight = C.second;
> +
> + if (Weight == 0)
> + continue;
> +
> + // Ignore edges between input sections belonging to different output
> + // sections. This is done because otherwise we would end up with clusters
> + // containing input sections that can't actually be placed adjacently in the
> + // output. This messes with the cluster size and density calculations. We
> + // would also end up moving input sections in other output sections without
> + // moving them closer to what calls them.
> + if (FromSB->getOutputSection() != ToSB->getOutputSection())
> + continue;
> +
> + SectionIndex From = GetOrCreateNode(FromSB);
> + SectionIndex To = GetOrCreateNode(ToSB);
> +
> + Sections[To].Weight = SaturatingAdd(Sections[To].Weight, Weight);
> +
> + if (From == To)
> + continue;
> +
> + Edge E{From, To, Weight};
> +
> + // Add or increment an edge
> + auto Res = EdgeMap.insert(std::make_pair(E, Edges.size()));
> + EdgeIndex EI = Res.first->second;
> + if (Res.second) {
> + Edges.push_back(E);
> + Sections[From].Succs.push_back(To);
> + Sections[To].Preds.push_back(From);
> + } else
> + Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, Weight);
> + }
> + normalizeEdgeWeights();
> +}
> +
> +// Normalize the edge weights so that we can reject edges which have a low
> +// probability.
> +void CallGraphSort::normalizeEdgeWeights() {
> + for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) {
> + Section &S = Sections[SI];
> + for (SectionIndex PI : S.Preds) {
> + Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]];
> + if (S.Weight == 0)
> + continue;
> + E.NormalizedWeight = Double(E.Weight) / Double(S.Weight);
> + }
> + }
> +}
> +
> +// It's bad to merge clusters which would degrade the density too much.
> +static bool isNewDensityBad(Cluster &A, Cluster &B) {
> + Double NewDensity = Double(A.Weight + B.Weight) / Double(A.Size + B.Size);
> + if (A.getDensity() >
> + NewDensity * Double(MAX_DENSITY_DEGRADATION))
> + return true;
> + return false;
> +}
> +
> +static void mergeClusters(Cluster &Into, Cluster &From) {
> + Into.ISBs.insert(Into.ISBs.end(), From.ISBs.begin(), From.ISBs.end());
> + Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
> + From.Sections.end());
> + Into.Size += From.Size;
> + Into.Weight += From.Weight;
> + From.ISBs.clear();
> + From.Sections.clear();
> + From.Size = 0;
> + From.Weight = 0;
> +}
> +
> +// Group InputSections into clusters using the Call-Chain Clustering heuristic
> +// then sort the clusters by density.
> +void CallGraphSort::generateClusters() {
> + // Minimum edge probability to consider merging.
> + const Double MIN_EDGE_PROBABILITY = Double(1) / Double(10);
> +
> + std::vector<SectionIndex> SortedSecs;
> + std::vector<Cluster *> SecToCluster(Sections.size());
> +
> + Clusters.reserve(Sections.size());
> +
> + for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) {
> + Cluster C(SI, Sections[SI].ISB);
> + C.Size = Sections[SI].Size;
> + C.Weight = Sections[SI].Weight;
> + Clusters.push_back(C);
> + SortedSecs.push_back(SI);
> + }
> +
> + for (Cluster &C : Clusters) {
> + SecToCluster[C.Sections.front()] = &C;
> + }
> +
> + std::stable_sort(SortedSecs.begin(), SortedSecs.end(),
> + [&](SectionIndex A, SectionIndex B) {
> + return Sections[A].getDensity() > Sections[B].getDensity();
> + });
> +
> + for (SectionIndex SI : SortedSecs) {
> + Cluster &C = *SecToCluster[SI];
> +
> + SectionIndex BestPred = -1;
> + Double BestWeight = 0;
> +
> + for (SectionIndex PI : Sections[SI].Preds) {
> + Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]];
> + if (BestPred == -1 || E.NormalizedWeight > BestWeight) {
> + BestPred = PI;
> + BestWeight = E.NormalizedWeight;
> + }
> + }
> +
> + if (BestWeight < MIN_EDGE_PROBABILITY)
> + continue;
> +
> + Cluster *PredC = SecToCluster[BestPred];
> + if (PredC == nullptr || PredC == &C)
> + continue;
> +
> + if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
> + continue;
> +
> + if (isNewDensityBad(*PredC, C))
> + continue;
> +
> + for (SectionIndex SI : C.Sections)
> + SecToCluster[SI] = PredC;
> +
> + mergeClusters(*PredC, C);
> + }
> +
> + // Remove empty or dead nodes.
> + Clusters.erase(std::remove_if(Clusters.begin(), Clusters.end(),
> + [](const Cluster &C) {
> + return C.Size == 0 || C.Sections.empty();
> + }),
> + Clusters.end());
> +
> + // Sort by density. Invalidates all indices and pointers into Clusters.
> + std::sort(Clusters.begin(), Clusters.end(),
> + [](const Cluster &A, const Cluster &B) {
> + return A.getDensity() > B.getDensity();
> + });
> +}
> +
> +DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
> + generateClusters();
> +
> + // Generate order.
> + llvm::DenseMap<const InputSectionBase *, int> OrderMap;
> + ssize_t CurOrder = 1;
> +
> + for (const Cluster &C : Clusters)
> + for (const InputSectionBase *IS : C.ISBs)
> + OrderMap[IS] = CurOrder++;
> +
> + return OrderMap;
> +}
> +
> +// Sort sections by the profile data provided by --call-graph-ordering-file
> +//
> +// This first builds a call graph based on the profile data then merges sections
> +// according to the C³ heuristic. All clusters are then sorted by a density
> +// metric to further improve locality.
> +DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
> + return CallGraphSort().run();
> +}
> Index: ELF/CMakeLists.txt
> ===================================================================
> --- ELF/CMakeLists.txt
> +++ ELF/CMakeLists.txt
> @@ -19,6 +19,7 @@
> Arch/SPARCV9.cpp
> Arch/X86.cpp
> Arch/X86_64.cpp
> + CallGraphSort.cpp
> Driver.cpp
> DriverUtils.cpp
> EhFrame.cpp
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list