[llvm-dev] [RFC] Profile guided section layout
Rafael EspĂndola via llvm-dev
llvm-dev at lists.llvm.org
Mon Jul 31 15:12:48 PDT 2017
A rebased version of the lld patch is attached.
Cheers,
Rafael
On 31 July 2017 at 15:11, Rafael Avila de Espindola
<rafael.espindola at gmail.com> wrote:
> Tobias Edler von Koch <tobias at codeaurora.org> writes:
>
>> Hi Rafael,
>>
>> On 07/31/2017 04:20 PM, Rafael Avila de Espindola via llvm-dev wrote:
>>> However, do we need to start with instrumentation? The original paper
>>> uses sampling with good results and current intel cpus can record every
>>> branch in a program.
>>>
>>> I would propose starting with just an lld patch that reads the call
>>> graph from a file. The format would be very similar to what you propose,
>>> just weight,caller,callee.
>>
>> The advantage of the proposed approach (weighted callgraph section) is
>> that it's completely transparent: it works regardless of the particular
>> profiling methodology (as long as there's !perf metadata when the pass
>> runs). For this reason, it fits neatly into an *existing* PGO-based
>> build flow. I only need to add 1 compiler flag to enable it. That's a
>> big plus.
>>
>> On the other hand, I could see how your idea (callgraph input file for
>> linker) would be useful in situations where I just want to do section
>> layout but no PGO in the compiler... and of course for testing of the
>> linker's sorting algorithm.
>>
>> So there's merits in both, but for my use cases Michael's original
>> approach is the most practical.
>
> Yes, I must stress that I am not proposing having the option of reading
> the callgraph from another file *instead* of reading it from .o
> files. Just that doing it first decouples most of the lld patch form the
> llvm changes and can be useful in cases where only samples are available.
>
> Cheers,
> Rafael
-------------- next part --------------
diff --git a/ELF/Config.h b/ELF/Config.h
index 45c9565..6928583 100644
--- a/ELF/Config.h
+++ b/ELF/Config.h
@@ -10,6 +10,7 @@
#ifndef LLD_ELF_CONFIG_H
#define LLD_ELF_CONFIG_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
@@ -108,6 +109,8 @@ struct Configuration {
std::vector<SymbolVersion> VersionScriptLocals;
std::vector<uint8_t> BuildIdVector;
llvm::MapVector<Symbol *, RenamedSymbol> RenamedSymbols;
+ llvm::DenseMap<std::pair<llvm::StringRef, llvm::StringRef>, uint64_t>
+ CFGProfile;
bool AllowMultipleDefinition;
bool AsNeeded = false;
bool Bsymbolic;
@@ -127,6 +130,7 @@ struct Configuration {
bool GnuHash;
bool ICF;
bool MipsN32Abi = false;
+ bool NoCFGProfileReorder;
bool NoGnuUnique;
bool NoUndefinedVersion;
bool NoinhibitExec;
diff --git a/ELF/Driver.cpp b/ELF/Driver.cpp
index 263ba7b..84d4d80 100644
--- a/ELF/Driver.cpp
+++ b/ELF/Driver.cpp
@@ -644,6 +644,7 @@ void LinkerDriver::readConfigs(opt::InputArgList &Args) {
Config->LTOO = getInteger(Args, OPT_lto_O, 2);
Config->LTOPartitions = getInteger(Args, OPT_lto_partitions, 1);
Config->MapFile = Args.getLastArgValue(OPT_Map);
+ Config->NoCFGProfileReorder = Args.hasArg(OPT_no_cfg_profile_reorder);
Config->NoGnuUnique = Args.hasArg(OPT_no_gnu_unique);
Config->NoUndefinedVersion = Args.hasArg(OPT_no_undefined_version);
Config->NoinhibitExec = Args.hasArg(OPT_noinhibit_exec);
diff --git a/ELF/InputFiles.cpp b/ELF/InputFiles.cpp
index a6cd1a6..848223b 100644
--- a/ELF/InputFiles.cpp
+++ b/ELF/InputFiles.cpp
@@ -511,6 +511,35 @@ InputSectionBase *ObjFile<ELFT>::createInputSection(const Elf_Shdr &Sec) {
if (Name == ".eh_frame" && !Config->Relocatable)
return make<EhInputSection>(this, &Sec, Name);
+ // Profile data.
+ if (Name == ".note.llvm.callgraph") {
+ ArrayRef<uint8_t> CallgraphBuff =
+ check(this->getObj().getSectionContents(&Sec));
+
+ StringRef Buff((const char *)CallgraphBuff.data(), CallgraphBuff.size());
+
+ auto ReadString = [&Buff]() {
+ size_t F = Buff.find_first_of(" \n");
+ StringRef Ret = Buff.substr(0, F);
+ Buff = Buff.substr(F + 1);
+ return Ret;
+ };
+
+ while (!Buff.empty()) {
+ StringRef From = ReadString();
+ StringRef To = ReadString();
+ uint64_t Count;
+ if (ReadString().getAsInteger(10, Count))
+ break;
+
+ // Merge duplicate counts by picking the largest.
+ uint64_t &C = Config->CFGProfile[std::make_pair(From, To)];
+ C = std::max(C, Count);
+ }
+
+ return &InputSection::Discarded;
+ }
+
if (shouldMerge(Sec))
return make<MergeInputSection>(this, &Sec, Name);
return make<InputSection>(this, &Sec, Name);
diff --git a/ELF/Options.td b/ELF/Options.td
index 0de0d73..000e111 100644
--- a/ELF/Options.td
+++ b/ELF/Options.td
@@ -162,6 +162,9 @@ def nostdlib: F<"nostdlib">,
def no_as_needed: F<"no-as-needed">,
HelpText<"Always DT_NEEDED for shared libraries">;
+def no_cfg_profile_reorder: F<"no-cfg-profile-reorder">,
+ HelpText<"Disable reordering of sections based on profile information">;
+
def no_color_diagnostics: F<"no-color-diagnostics">,
HelpText<"Do not use colors in diagnostics">;
diff --git a/ELF/Writer.cpp b/ELF/Writer.cpp
index a9e3856..3a6e174 100644
--- a/ELF/Writer.cpp
+++ b/ELF/Writer.cpp
@@ -20,11 +20,13 @@
#include "SyntheticSections.h"
#include "Target.h"
#include "Threads.h"
+#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
+#include <unordered_set>
using namespace llvm;
using namespace llvm::ELF;
@@ -896,6 +898,157 @@ template <class ELFT> static void sortBySymbolsOrder() {
Sec->sort([&](InputSectionBase *S) { return SectionOrder.lookup(S); });
}
+// Sort sections by the profile data provided in the .note.llvm.callgraph
+// sections.
+//
+// This algorithm is based on Call-Chain Clustering from:
+// Optimizing Function Placement for Large-Scale Data-Center Applications
+// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+//
+// This first builds a call graph based on the profile data then iteratively
+// merges the hottest call edges as long as it would not create a cluster larger
+// than the page size. All clusters are then sorted by a density metric to
+// further improve locality.
+template <class ELFT>
+static void sortByCFGProfile(ArrayRef<OutputSection *> OutputSections) {
+ if (Config->NoCFGProfileReorder)
+ return;
+
+ using NodeIndex = std::ptrdiff_t;
+
+ struct Node {
+ Node() {}
+ Node(const InputSectionBase *IS) {
+ Sections.push_back(IS);
+ Size = IS->getSize();
+ }
+ std::vector<const InputSectionBase *> Sections;
+ int64_t Size = 0;
+ uint64_t Weight = 0;
+ };
+
+ struct Edge {
+ NodeIndex From;
+ NodeIndex To;
+ mutable uint64_t Weight;
+ bool operator==(const Edge Other) const {
+ return From == Other.From && To == Other.To;
+ }
+ };
+
+ struct EdgeHash {
+ std::size_t operator()(const Edge E) const {
+ return llvm::hash_combine(E.From, E.To);
+ };
+ };
+
+ std::vector<Node> Nodes;
+ std::unordered_set<Edge, EdgeHash> Edges;
+
+ auto InsertOrIncrementEdge = [](std::unordered_set<Edge, EdgeHash> &Edges,
+ const Edge E) {
+ if (E.From == E.To)
+ return;
+ auto Res = Edges.insert(E);
+ if (!Res.second)
+ Res.first->Weight = SaturatingAdd(Res.first->Weight, E.Weight);
+ };
+
+ {
+ llvm::DenseMap<const InputSectionBase *, NodeIndex> SecToNode;
+
+ auto GetOrCreateNode =
+ [&Nodes, &SecToNode](const InputSectionBase *IS) -> NodeIndex {
+ auto Res = SecToNode.insert(std::make_pair(IS, Nodes.size()));
+ if (Res.second)
+ Nodes.emplace_back(IS);
+ return Res.first->second;
+ };
+
+ // Create the graph.
+ for (const auto &C : Config->CFGProfile) {
+ if (C.second == 0)
+ continue;
+ DefinedRegular *FromDR =
+ dyn_cast_or_null<DefinedRegular>(Symtab->find(C.first.first));
+ DefinedRegular *ToDR =
+ dyn_cast_or_null<DefinedRegular>(Symtab->find(C.first.second));
+ if (!FromDR || !ToDR)
+ continue;
+ auto FromSB = dyn_cast_or_null<const InputSectionBase>(FromDR->Section);
+ auto ToSB = dyn_cast_or_null<const InputSectionBase>(ToDR->Section);
+ if (!FromSB || !ToSB)
+ continue;
+ NodeIndex From = GetOrCreateNode(FromSB);
+ NodeIndex To = GetOrCreateNode(ToSB);
+ InsertOrIncrementEdge(Edges, {From, To, C.second});
+ Nodes[To].Weight = SaturatingAdd(Nodes[To].Weight, C.second);
+ }
+ }
+
+ // Collapse the graph.
+ while (!Edges.empty()) {
+ // Find the largest edge
+ // FIXME: non deterministic order for equal edges.
+ // FIXME: n^2
+ auto Max = std::max_element(
+ Edges.begin(), Edges.end(),
+ [](const Edge A, const Edge B) { return A.Weight < B.Weight; });
+ const Edge MaxE = *Max;
+ Edges.erase(Max);
+ // Merge the Nodes.
+ Node &From = Nodes[MaxE.From];
+ Node &To = Nodes[MaxE.To];
+ if (From.Size + To.Size > Target->PageSize)
+ continue;
+ From.Sections.insert(From.Sections.end(), To.Sections.begin(),
+ To.Sections.end());
+ To.Sections.clear();
+ From.Size += To.Size;
+ From.Weight = SaturatingAdd(From.Weight, To.Weight);
+ // Collect all edges from or to the removed node and update them for the new
+ // node.
+ std::vector<Edge> OldEdges;
+ // FIXME: n^2
+ for (auto EI = Edges.begin(), EE = Edges.end(); EI != EE;) {
+ if (EI->From == MaxE.To || EI->To == MaxE.To) {
+ OldEdges.push_back(*EI);
+ EI = Edges.erase(EI);
+ } else
+ ++EI;
+ }
+ for (const Edge E : OldEdges) {
+ InsertOrIncrementEdge(Edges,
+ {E.From == MaxE.To ? MaxE.From : E.From,
+ E.To == MaxE.To ? MaxE.From : E.To, E.Weight});
+ }
+ }
+
+ // Sort by density.
+ std::sort(Nodes.begin(), Nodes.end(), [](const Node &A, const Node &B) {
+ return double(A.Weight) / double(A.Size) <
+ double(B.Weight) / double(B.Size);
+ });
+
+ // Generate order.
+ llvm::DenseMap<const InputSectionBase *, std::size_t> OrderMap;
+ ssize_t CurOrder = 0;
+
+ for (const Node &N : Nodes) {
+ if (N.Sections.empty())
+ continue;
+ for (const InputSectionBase *IS : N.Sections)
+ OrderMap[IS] = CurOrder++;
+ }
+
+ for (OutputSection *OS : OutputSections) {
+ if (OS->Name != ".text")
+ continue;
+ OS->sort([&](InputSectionBase *IS) { return OrderMap.lookup(IS); });
+ break;
+ }
+}
+
template <class ELFT>
void Writer<ELFT>::forEachRelSec(std::function<void(InputSectionBase &)> Fn) {
for (InputSectionBase *IS : InputSections) {
@@ -928,6 +1081,7 @@ template <class ELFT> void Writer<ELFT>::createSections() {
Old.end());
Script->fabricateDefaultCommands();
+ sortByCFGProfile<ELFT>(OutputSections);
sortBySymbolsOrder<ELFT>();
sortInitFini(findSection(".init_array"));
sortInitFini(findSection(".fini_array"));
More information about the llvm-dev
mailing list