[llvm-dev] [RFC] Profile guided section layout

Rafael EspĂ­ndola via llvm-dev llvm-dev at lists.llvm.org
Mon Jul 31 15:12:48 PDT 2017


A rebased version of the lld patch is attached.

Cheers,
Rafael


On 31 July 2017 at 15:11, Rafael Avila de Espindola
<rafael.espindola at gmail.com> wrote:
> Tobias Edler von Koch <tobias at codeaurora.org> writes:
>
>> Hi Rafael,
>>
>> On 07/31/2017 04:20 PM, Rafael Avila de Espindola via llvm-dev wrote:
>>> However, do we need to start with instrumentation? The original paper
>>> uses sampling with good results and current intel cpus can record every
>>> branch in a program.
>>>
>>> I would propose starting with just an lld patch that reads the call
>>> graph from a file. The format would be very similar to what you propose,
>>> just weight,caller,callee.
>>
>> The advantage of the proposed approach (weighted callgraph section) is
>> that it's completely transparent: it works regardless of the particular
>> profiling methodology (as long as there's !perf metadata when the pass
>> runs). For this reason, it fits neatly into an *existing* PGO-based
>> build flow. I only need to add 1 compiler flag to enable it. That's a
>> big plus.
>>
>> On the other hand, I could see how your idea (callgraph input file for
>> linker) would be useful in situations where I just want to do section
>> layout but no PGO in the compiler... and of course for testing of the
>> linker's sorting algorithm.
>>
>> So there's merits in both, but for my use cases Michael's original
>> approach is the most practical.
>
> Yes, I must stress that I am not proposing having the option of reading
> the callgraph from another file *instead* of reading it from .o
> files. Just that doing it first decouples most of the lld patch form the
> llvm changes and can be useful in cases where only samples are available.
>
> Cheers,
> Rafael
-------------- next part --------------
diff --git a/ELF/Config.h b/ELF/Config.h
index 45c9565..6928583 100644
--- a/ELF/Config.h
+++ b/ELF/Config.h
@@ -10,6 +10,7 @@
 #ifndef LLD_ELF_CONFIG_H
 #define LLD_ELF_CONFIG_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
@@ -108,6 +109,8 @@ struct Configuration {
   std::vector<SymbolVersion> VersionScriptLocals;
   std::vector<uint8_t> BuildIdVector;
   llvm::MapVector<Symbol *, RenamedSymbol> RenamedSymbols;
+  llvm::DenseMap<std::pair<llvm::StringRef, llvm::StringRef>, uint64_t>
+      CFGProfile;
   bool AllowMultipleDefinition;
   bool AsNeeded = false;
   bool Bsymbolic;
@@ -127,6 +130,7 @@ struct Configuration {
   bool GnuHash;
   bool ICF;
   bool MipsN32Abi = false;
+  bool NoCFGProfileReorder;
   bool NoGnuUnique;
   bool NoUndefinedVersion;
   bool NoinhibitExec;
diff --git a/ELF/Driver.cpp b/ELF/Driver.cpp
index 263ba7b..84d4d80 100644
--- a/ELF/Driver.cpp
+++ b/ELF/Driver.cpp
@@ -644,6 +644,7 @@ void LinkerDriver::readConfigs(opt::InputArgList &Args) {
   Config->LTOO = getInteger(Args, OPT_lto_O, 2);
   Config->LTOPartitions = getInteger(Args, OPT_lto_partitions, 1);
   Config->MapFile = Args.getLastArgValue(OPT_Map);
+  Config->NoCFGProfileReorder = Args.hasArg(OPT_no_cfg_profile_reorder);
   Config->NoGnuUnique = Args.hasArg(OPT_no_gnu_unique);
   Config->NoUndefinedVersion = Args.hasArg(OPT_no_undefined_version);
   Config->NoinhibitExec = Args.hasArg(OPT_noinhibit_exec);
diff --git a/ELF/InputFiles.cpp b/ELF/InputFiles.cpp
index a6cd1a6..848223b 100644
--- a/ELF/InputFiles.cpp
+++ b/ELF/InputFiles.cpp
@@ -511,6 +511,35 @@ InputSectionBase *ObjFile<ELFT>::createInputSection(const Elf_Shdr &Sec) {
   if (Name == ".eh_frame" && !Config->Relocatable)
     return make<EhInputSection>(this, &Sec, Name);
 
+  // Profile data.
+  if (Name == ".note.llvm.callgraph") {
+    ArrayRef<uint8_t> CallgraphBuff =
+        check(this->getObj().getSectionContents(&Sec));
+
+    StringRef Buff((const char *)CallgraphBuff.data(), CallgraphBuff.size());
+
+    auto ReadString = [&Buff]() {
+      size_t F = Buff.find_first_of(" \n");
+      StringRef Ret = Buff.substr(0, F);
+      Buff = Buff.substr(F + 1);
+      return Ret;
+    };
+
+    while (!Buff.empty()) {
+      StringRef From = ReadString();
+      StringRef To = ReadString();
+      uint64_t Count;
+      if (ReadString().getAsInteger(10, Count))
+        break;
+
+      // Merge duplicate counts by picking the largest.
+      uint64_t &C = Config->CFGProfile[std::make_pair(From, To)];
+      C = std::max(C, Count);
+    }
+
+    return &InputSection::Discarded;
+  }
+
   if (shouldMerge(Sec))
     return make<MergeInputSection>(this, &Sec, Name);
   return make<InputSection>(this, &Sec, Name);
diff --git a/ELF/Options.td b/ELF/Options.td
index 0de0d73..000e111 100644
--- a/ELF/Options.td
+++ b/ELF/Options.td
@@ -162,6 +162,9 @@ def nostdlib: F<"nostdlib">,
 def no_as_needed: F<"no-as-needed">,
   HelpText<"Always DT_NEEDED for shared libraries">;
 
+def no_cfg_profile_reorder: F<"no-cfg-profile-reorder">,
+  HelpText<"Disable reordering of sections based on profile information">;
+
 def no_color_diagnostics: F<"no-color-diagnostics">,
   HelpText<"Do not use colors in diagnostics">;
 
diff --git a/ELF/Writer.cpp b/ELF/Writer.cpp
index a9e3856..3a6e174 100644
--- a/ELF/Writer.cpp
+++ b/ELF/Writer.cpp
@@ -20,11 +20,13 @@
 #include "SyntheticSections.h"
 #include "Target.h"
 #include "Threads.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <climits>
+#include <unordered_set>
 
 using namespace llvm;
 using namespace llvm::ELF;
@@ -896,6 +898,157 @@ template <class ELFT> static void sortBySymbolsOrder() {
       Sec->sort([&](InputSectionBase *S) { return SectionOrder.lookup(S); });
 }
 
+// Sort sections by the profile data provided in the .note.llvm.callgraph
+// sections.
+//
+// This algorithm is based on Call-Chain Clustering from:
+// Optimizing Function Placement for Large-Scale Data-Center Applications
+// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+//
+// This first builds a call graph based on the profile data then iteratively
+// merges the hottest call edges as long as it would not create a cluster larger
+// than the page size. All clusters are then sorted by a density metric to
+// further improve locality.
+template <class ELFT>
+static void sortByCFGProfile(ArrayRef<OutputSection *> OutputSections) {
+  if (Config->NoCFGProfileReorder)
+    return;
+
+  using NodeIndex = std::ptrdiff_t;
+
+  struct Node {
+    Node() {}
+    Node(const InputSectionBase *IS) {
+      Sections.push_back(IS);
+      Size = IS->getSize();
+    }
+    std::vector<const InputSectionBase *> Sections;
+    int64_t Size = 0;
+    uint64_t Weight = 0;
+  };
+
+  struct Edge {
+    NodeIndex From;
+    NodeIndex To;
+    mutable uint64_t Weight;
+    bool operator==(const Edge Other) const {
+      return From == Other.From && To == Other.To;
+    }
+  };
+
+  struct EdgeHash {
+    std::size_t operator()(const Edge E) const {
+      return llvm::hash_combine(E.From, E.To);
+    };
+  };
+
+  std::vector<Node> Nodes;
+  std::unordered_set<Edge, EdgeHash> Edges;
+
+  auto InsertOrIncrementEdge = [](std::unordered_set<Edge, EdgeHash> &Edges,
+                                  const Edge E) {
+    if (E.From == E.To)
+      return;
+    auto Res = Edges.insert(E);
+    if (!Res.second)
+      Res.first->Weight = SaturatingAdd(Res.first->Weight, E.Weight);
+  };
+
+  {
+    llvm::DenseMap<const InputSectionBase *, NodeIndex> SecToNode;
+
+    auto GetOrCreateNode =
+        [&Nodes, &SecToNode](const InputSectionBase *IS) -> NodeIndex {
+      auto Res = SecToNode.insert(std::make_pair(IS, Nodes.size()));
+      if (Res.second)
+        Nodes.emplace_back(IS);
+      return Res.first->second;
+    };
+
+    // Create the graph.
+    for (const auto &C : Config->CFGProfile) {
+      if (C.second == 0)
+        continue;
+      DefinedRegular *FromDR =
+          dyn_cast_or_null<DefinedRegular>(Symtab->find(C.first.first));
+      DefinedRegular *ToDR =
+          dyn_cast_or_null<DefinedRegular>(Symtab->find(C.first.second));
+      if (!FromDR || !ToDR)
+        continue;
+      auto FromSB = dyn_cast_or_null<const InputSectionBase>(FromDR->Section);
+      auto ToSB = dyn_cast_or_null<const InputSectionBase>(ToDR->Section);
+      if (!FromSB || !ToSB)
+        continue;
+      NodeIndex From = GetOrCreateNode(FromSB);
+      NodeIndex To = GetOrCreateNode(ToSB);
+      InsertOrIncrementEdge(Edges, {From, To, C.second});
+      Nodes[To].Weight = SaturatingAdd(Nodes[To].Weight, C.second);
+    }
+  }
+
+  // Collapse the graph.
+  while (!Edges.empty()) {
+    // Find the largest edge
+    // FIXME: non deterministic order for equal edges.
+    // FIXME: n^2
+    auto Max = std::max_element(
+        Edges.begin(), Edges.end(),
+        [](const Edge A, const Edge B) { return A.Weight < B.Weight; });
+    const Edge MaxE = *Max;
+    Edges.erase(Max);
+    // Merge the Nodes.
+    Node &From = Nodes[MaxE.From];
+    Node &To = Nodes[MaxE.To];
+    if (From.Size + To.Size > Target->PageSize)
+      continue;
+    From.Sections.insert(From.Sections.end(), To.Sections.begin(),
+                         To.Sections.end());
+    To.Sections.clear();
+    From.Size += To.Size;
+    From.Weight = SaturatingAdd(From.Weight, To.Weight);
+    // Collect all edges from or to the removed node and update them for the new
+    // node.
+    std::vector<Edge> OldEdges;
+    // FIXME: n^2
+    for (auto EI = Edges.begin(), EE = Edges.end(); EI != EE;) {
+      if (EI->From == MaxE.To || EI->To == MaxE.To) {
+        OldEdges.push_back(*EI);
+        EI = Edges.erase(EI);
+      } else
+        ++EI;
+    }
+    for (const Edge E : OldEdges) {
+      InsertOrIncrementEdge(Edges,
+                            {E.From == MaxE.To ? MaxE.From : E.From,
+                             E.To == MaxE.To ? MaxE.From : E.To, E.Weight});
+    }
+  }
+
+  // Sort by density.
+  std::sort(Nodes.begin(), Nodes.end(), [](const Node &A, const Node &B) {
+    return double(A.Weight) / double(A.Size) <
+           double(B.Weight) / double(B.Size);
+  });
+
+  // Generate order.
+  llvm::DenseMap<const InputSectionBase *, std::size_t> OrderMap;
+  ssize_t CurOrder = 0;
+
+  for (const Node &N : Nodes) {
+    if (N.Sections.empty())
+      continue;
+    for (const InputSectionBase *IS : N.Sections)
+      OrderMap[IS] = CurOrder++;
+  }
+
+  for (OutputSection *OS : OutputSections) {
+    if (OS->Name != ".text")
+      continue;
+    OS->sort([&](InputSectionBase *IS) { return OrderMap.lookup(IS); });
+    break;
+  }
+}
+
 template <class ELFT>
 void Writer<ELFT>::forEachRelSec(std::function<void(InputSectionBase &)> Fn) {
   for (InputSectionBase *IS : InputSections) {
@@ -928,6 +1081,7 @@ template <class ELFT> void Writer<ELFT>::createSections() {
                               Old.end());
 
   Script->fabricateDefaultCommands();
+  sortByCFGProfile<ELFT>(OutputSections);
   sortBySymbolsOrder<ELFT>();
   sortInitFini(findSection(".init_array"));
   sortInitFini(findSection(".fini_array"));


More information about the llvm-dev mailing list