[llvm] 34e131b - [llvm-profgen] On-demand track optimized-away inlinees for preinliner.

Tue Feb 8 08:33:29 PST 2022

Author: Hongtao Yu
Date: 2022-02-08T08:33:23-08:00
New Revision: 34e131b0f253d9c2eda78378aaf77703bc2921f9

URL: https://github.com/llvm/llvm-project/commit/34e131b0f253d9c2eda78378aaf77703bc2921f9
DIFF: https://github.com/llvm/llvm-project/commit/34e131b0f253d9c2eda78378aaf77703bc2921f9.diff

LOG: [llvm-profgen] On-demand track optimized-away inlinees for preinliner.

Tracking optimized-away inlinees based on all probes in a binary is expansive in terms of memory usage I'm making the tracking on-demand based on profiled functions only. This saves about 10%  memory overall for a medium-sized benchmark.

Before:

   note: After parsePerfTraces
   note: Thu Jan 27 18:42:09 2022
   note: VM: 8.68 GB   RSS: 8.39 GB
   note: After computeSizeForProfiledFunctions
   note: Thu Jan 27 18:42:41 2022
   note: **VM: 10.63 GB   RSS: 10.20 GB**
   note: After generateProbeBasedProfile
   note: Thu Jan 27 18:45:49 2022
   note: VM: 25.00 GB   RSS: 24.95 GB
   note: After postProcessProfiles
   note: Thu Jan 27 18:49:29 2022
   note: VM: 26.34 GB   RSS: 26.27 GB

After:
   note: After parsePerfTraces
   note: Fri Jan 28 12:04:49 2022
   note: VM: 8.68 GB   RSS: 7.65 GB
   note: After computeSizeForProfiledFunctions
   note: Fri Jan 28 12:05:26 2022
   note: **VM: 8.68 GB   RSS: 8.42 GB**
   note: After generateProbeBasedProfile
   note: Fri Jan 28 12:08:03 2022
   note: VM: 22.93 GB   RSS: 22.89 GB
   note: After postProcessProfiles
   note: Fri Jan 28 12:11:30 2022
   note: VM: 24.27 GB   RSS: 24.22 GB

This should be a no-diff change in terms of profile quality.

Reviewed By: wenlei

Differential Revision: https://reviews.llvm.org/D118515

Added: 
    

Modified: 
    llvm/tools/llvm-profgen/ProfileGenerator.cpp
    llvm/tools/llvm-profgen/ProfiledBinary.cpp
    llvm/tools/llvm-profgen/ProfiledBinary.h

Removed: 
    


################################################################################
diff  --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 61d4626574a00..b0e72def46db6 100644

--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -589,27 +589,24 @@ void CSProfileGenerator::generateProfile() {
 }
 
 void CSProfileGenerator::computeSizeForProfiledFunctions() {
-  // Hash map to deduplicate the function range and the item is a pair of
-  // function start and end offset.
-  std::unordered_map<uint64_t, uint64_t> AggregatedRanges;
+  std::unordered_set<const BinaryFunction *> ProfiledFunctions;
+
   // Go through all the ranges in the CS counters, use the start of the range to
-  // look up the function it belongs and record the function range.
+  // look up the function it belongs and record the function.
   for (const auto &CI : SampleCounters) {
     for (const auto &Item : CI.second.RangeCounter) {
       // FIXME: Filter the bogus crossing function range.
       uint64_t StartOffset = Item.first.first;
-      // Note that a function can be spilt into multiple ranges, so get all
-      // ranges of the function.
-      for (const auto &Range : Binary->getRangesForOffset(StartOffset))
-        AggregatedRanges[Range.first] = Range.second;
+      if (FuncRange *FRange = Binary->findFuncRangeForOffset(StartOffset))
+        ProfiledFunctions.insert(FRange->Func);
     }
   }
 
-  for (const auto &I : AggregatedRanges) {
-    uint64_t StartOffset = I.first;
-    uint64_t EndOffset = I.second;
-    Binary->computeInlinedContextSizeForRange(StartOffset, EndOffset);
-  }
+  for (auto *Func : ProfiledFunctions)
+    Binary->computeInlinedContextSizeForFunc(Func);
+
+  // Flush the symbolizer to save memory.
+  Binary->flushSymbolizer();
 }
 
 void CSProfileGenerator::generateLineNumBasedProfile() {

diff  --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index e477eb86f2652..3430b030c01a8 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -219,10 +219,6 @@ void ProfiledBinary::load() {
   // Disassemble the text sections.
   disassemble(Obj);
 
-  // Track size for optimized inlinees when probe is available
-  if (UsePseudoProbes && TrackFuncContextSize)
-    FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder);
-
   // Use function start and return address to infer prolog and epilog
   ProEpilogTracker.inferPrologOffsets(StartOffset2FuncRangeMap);
   ProEpilogTracker.inferEpilogOffsets(RetOffsets);
@@ -349,6 +345,17 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
     }
   }
 
+  // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe
+  // is available
+  if (UsePseudoProbes && TrackFuncContextSize) {
+    for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) {
+      auto *Frame = Child.second.get();
+      StringRef FuncName =
+          ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName;
+      TopLevelProbeFrameMap[FuncName] = Frame;
+    }
+  }
+
   if (ShowPseudoProbe)
     ProbeDecoder.printGUID2FuncDescMap(outs());
 }
@@ -747,6 +754,25 @@ void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t StartOffset,
   } while (IP.advance() && IP.Address < RangeEnd);
 }
 
+void ProfiledBinary::computeInlinedContextSizeForFunc(
+    const BinaryFunction *Func) {
+  // Note that a function can be spilt into multiple ranges, so compute for all
+  // ranges of the function.
+  for (const auto &Range : Func->Ranges)
+    computeInlinedContextSizeForRange(Range.first, Range.second);
+
+  // Track optimized-away inlinee for probed binary. A function inlined and then
+  // optimized away should still have their probes left over in places.
+  if (usePseudoProbes()) {
+    auto I = TopLevelProbeFrameMap.find(Func->FuncName);
+    if (I != TopLevelProbeFrameMap.end()) {
+      BinarySizeContextTracker::ProbeFrameStack ProbeContext;
+      FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder, *I->second,
+                                                 ProbeContext);
+    }
+  }
+}
+
 InstructionPointer::InstructionPointer(const ProfiledBinary *Binary,
                                        uint64_t Address, bool RoundToNext)
     : Binary(Binary), Address(Address) {

diff  --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index d3d1c6f1fd248..33b0b81fb0468 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -166,14 +166,14 @@ class BinarySizeContextTracker {
   // their remaining probes.
   void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder);
 
-  void dump() { RootContext.dumpTree(); }
-
-private:
   using ProbeFrameStack = SmallVector<std::pair<StringRef, uint32_t>>;
   void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder,
                               MCDecodedPseudoProbeInlineTree &ProbeNode,
                               ProbeFrameStack &Context);
 
+  void dump() { RootContext.dumpTree(); }
+
+private:
   // Root node for context trie tree, node that this is a reverse context trie
   // with callee as parent and caller as child. This way we can traverse from
   // root to find the best/longest matching context if an exact match does not
@@ -256,6 +256,9 @@ class ProfiledBinary {
   // Pseudo probe decoder
   MCPseudoProbeDecoder ProbeDecoder;
 
+  // Function name to probe frame map for top-level outlined functions.
+  StringMap<MCDecodedPseudoProbeInlineTree *> TopLevelProbeFrameMap;
+
   bool UsePseudoProbes = false;
 
   bool UseFSDiscriminator = false;
@@ -477,6 +480,8 @@ class ProfiledBinary {
     return Stack.back();
   }
 
+  void flushSymbolizer() { Symbolizer.reset(); }
+
   // Compare two addresses' inline context
   bool inlineContextEqual(uint64_t Add1, uint64_t Add2);
 
@@ -491,6 +496,8 @@ class ProfiledBinary {
   void computeInlinedContextSizeForRange(uint64_t StartOffset,
                                          uint64_t EndOffset);
 
+  void computeInlinedContextSizeForFunc(const BinaryFunction *Func);
+
   const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const {
     return ProbeDecoder.getCallProbeForAddr(Address);
   }