[llvm-branch-commits] [llvm] e562ff0 - [CSSPGO][llvm-profgen] Aggregate samples on call frame trie to speed up profile generation

Tom Stellard via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Feb 19 21:28:39 PST 2021


Author: wlei
Date: 2021-02-19T21:21:11-08:00
New Revision: e562ff08f634d814c1cd1e65e3428ca5308d3022

URL: https://github.com/llvm/llvm-project/commit/e562ff08f634d814c1cd1e65e3428ca5308d3022
DIFF: https://github.com/llvm/llvm-project/commit/e562ff08f634d814c1cd1e65e3428ca5308d3022.diff

LOG: [CSSPGO][llvm-profgen] Aggregate samples on call frame trie to speed up profile generation

For CS profile generation, the process of call stack unwinding is time-consuming since for each LBR entry we need linear time to generate the context (hash, compression, string concatenation). This change speeds this up by grouping all the call frames within one LBR sample into a trie and aggregating the result (sample counter) on it, deferring the context compression and string generation to the end of unwinding.

Specifically, it uses `StackLeaf` as the top frame on the stack and manipulates it (pops or pushes a trie node) dynamically during virtual unwinding so that the raw sample can just be recorded on the leaf node; the path (root to leaf) will represent its calling context. In the end, it traverses the trie and generates the context on the fly.

Results:
Our internal branch shows about a 5X speed-up on some large workloads in the SPEC06 benchmark.

Differential Revision: https://reviews.llvm.org/D94110

Added: 
    

Modified: 
    llvm/tools/llvm-profgen/PerfReader.cpp
    llvm/tools/llvm-profgen/PerfReader.h
    llvm/tools/llvm-profgen/ProfiledBinary.cpp
    llvm/tools/llvm-profgen/ProfiledBinary.h

Removed: 
    


################################################################################
diff  --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index d05c665f8583..787bde28400f 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -28,11 +28,12 @@ void VirtualUnwinder::unwindCall(UnwindState &State) {
   // 2nd frame is in prolog/epilog. In the future, we will switch to
   // pro/epi tracker(Dwarf CFI) for the precise check.
   uint64_t Source = State.getCurrentLBRSource();
-  auto Iter = State.CallStack.begin();
-  if (State.CallStack.size() == 1 || *(++Iter) != Source) {
-    State.CallStack.front() = Source;
+  auto *ParentFrame = State.getParentFrame();
+  if (ParentFrame == State.getDummyRootPtr() ||
+      ParentFrame->Address != Source) {
+    State.switchToFrame(Source);
   } else {
-    State.CallStack.pop_front();
+    State.popFrame();
   }
   State.InstPtr.update(Source);
 }
@@ -41,26 +42,29 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
   InstructionPointer &IP = State.InstPtr;
   uint64_t Target = State.getCurrentLBRTarget();
   uint64_t End = IP.Address;
-  if (State.getBinary()->usePseudoProbes()) {
+  if (Binary->usePseudoProbes()) {
+    // We don't need to top frame probe since it should be extracted
+    // from the range.
     // The outcome of the virtual unwinding with pseudo probes is a
     // map from a context key to the address range being unwound.
     // This means basically linear unwinding is not needed for pseudo
     // probes. The range will be simply recorded here and will be
     // converted to a list of pseudo probes to report in ProfileGenerator.
-    recordRangeCount(Target, End, State, Repeat);
+    State.getParentFrame()->recordRangeCount(Target, End, Repeat);
   } else {
     // Unwind linear execution part
+    uint64_t LeafAddr = State.CurrentLeafFrame->Address;
     while (IP.Address >= Target) {
       uint64_t PrevIP = IP.Address;
       IP.backward();
       // Break into segments for implicit call/return due to inlining
-      bool SameInlinee =
-          State.getBinary()->inlineContextEqual(PrevIP, IP.Address);
+      bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address);
       if (!SameInlinee || PrevIP == Target) {
-        recordRangeCount(PrevIP, End, State, Repeat);
+        State.switchToFrame(LeafAddr);
+        State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat);
         End = IP.Address;
       }
-      State.CallStack.front() = IP.Address;
+      LeafAddr = IP.Address;
     }
   }
 }
@@ -68,9 +72,9 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
 void VirtualUnwinder::unwindReturn(UnwindState &State) {
   // Add extra frame as we unwind through the return
   const LBREntry &LBR = State.getCurrentLBR();
-  uint64_t CallAddr = State.getBinary()->getCallAddrFromFrameAddr(LBR.Target);
-  State.CallStack.front() = CallAddr;
-  State.CallStack.push_front(LBR.Source);
+  uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target);
+  State.switchToFrame(CallAddr);
+  State.pushFrame(LBR.Source);
   State.InstPtr.update(LBR.Source);
 }
 
@@ -78,79 +82,100 @@ void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
   // TODO: Tolerate tail call for now, as we may see tail call from libraries.
   // This is only for intra function branches, excluding tail calls.
   uint64_t Source = State.getCurrentLBRSource();
-  State.CallStack.front() = Source;
+  State.switchToFrame(Source);
   State.InstPtr.update(Source);
 }
 
-SampleCounter &
-VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary,
-                                    std::list<uint64_t> &CallStack) {
-  if (Binary->usePseudoProbes()) {
-    return getOrCreateCounterForProbe(Binary, CallStack);
-  }
+std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() {
   std::shared_ptr<StringBasedCtxKey> KeyStr =
       std::make_shared<StringBasedCtxKey>();
-  KeyStr->Context = Binary->getExpandedContextStr(CallStack);
+  KeyStr->Context = Binary->getExpandedContextStr(Stack);
   KeyStr->genHashCode();
-  auto Ret =
-      CtxCounterMap->emplace(Hashable<ContextKey>(KeyStr), SampleCounter());
-  return Ret.first->second;
+  return KeyStr;
 }
 
-SampleCounter &
-VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary,
-                                            std::list<uint64_t> &CallStack) {
+std::shared_ptr<ProbeBasedCtxKey> ProbeStack::getContextKey() {
   std::shared_ptr<ProbeBasedCtxKey> ProbeBasedKey =
       std::make_shared<ProbeBasedCtxKey>();
-  if (CallStack.size() > 1) {
-    // We don't need to top frame probe since it should be extracted
-    // from the range.
-    // The top of stack is an instruction from the function where
-    // the LBR address range physcially resides. Strip it since
-    // the function is not a part of the call context. We also
-    // don't need its inline context since the probes being unwound
-    // come with an inline context all the way back to the uninlined
-    // function in their prefix tree.
-    auto Iter = CallStack.rbegin();
-    auto EndT = std::prev(CallStack.rend());
-    for (; Iter != EndT; Iter++) {
-      uint64_t Address = *Iter;
-      const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address);
-      // We may not find a probe for a merged or external callsite.
-      // Callsite merging may cause the loss of original probe IDs.
-      // Cutting off the context from here since the inline will
-      // not know how to consume a context with unknown callsites.
-      if (!CallProbe)
-        break;
-      ProbeBasedKey->Probes.emplace_back(CallProbe);
-    }
+  for (auto CallProbe : Stack) {
+    ProbeBasedKey->Probes.emplace_back(CallProbe);
   }
   CSProfileGenerator::compressRecursionContext<const PseudoProbe *>(
       ProbeBasedKey->Probes);
   ProbeBasedKey->genHashCode();
-  Hashable<ContextKey> ContextId(ProbeBasedKey);
-  auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter());
-  return Ret.first->second;
+  return ProbeBasedKey;
+}
+
+template <typename T>
+void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur,
+                                              T &Stack) {
+  if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty())
+    return;
+
+  std::shared_ptr<ContextKey> Key = Stack.getContextKey();
+  auto Ret = CtxCounterMap->emplace(Hashable<ContextKey>(Key), SampleCounter());
+  SampleCounter &SCounter = Ret.first->second;
+  for (auto &Item : Cur->RangeSamples) {
+    uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
+    uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
+    SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item));
+  }
+
+  for (auto &Item : Cur->BranchSamples) {
+    uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item));
+    uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item));
+    SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item));
+  }
+}
+
+template <typename T>
+void VirtualUnwinder::collectSamplesFromFrameTrie(
+    UnwindState::ProfiledFrame *Cur, T &Stack) {
+  if (!Cur->isDummyRoot()) {
+    if (!Stack.pushFrame(Cur)) {
+      // Process truncated context
+      for (const auto &Item : Cur->Children) {
+        // Start a new traversal ignoring its bottom context
+        collectSamplesFromFrameTrie(Item.second.get());
+      }
+      return;
+    }
+  }
+
+  collectSamplesFromFrame(Cur, Stack);
+  // Process children frame
+  for (const auto &Item : Cur->Children) {
+    collectSamplesFromFrameTrie(Item.second.get(), Stack);
+  }
+  // Recover the call stack
+  Stack.popFrame();
 }
 
-void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End,
-                                       UnwindState &State, uint64_t Repeat) {
-  uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start);
-  uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End);
-  SampleCounter &SCounter =
-      getOrCreateCounter(State.getBinary(), State.CallStack);
-  SCounter.recordRangeCount(StartOffset, EndOffset, Repeat);
+void VirtualUnwinder::collectSamplesFromFrameTrie(
+    UnwindState::ProfiledFrame *Cur) {
+  if (Binary->usePseudoProbes()) {
+    ProbeStack Stack(Binary);
+    collectSamplesFromFrameTrie<ProbeStack>(Cur, Stack);
+  } else {
+    FrameStack Stack(Binary);
+    collectSamplesFromFrameTrie<FrameStack>(Cur, Stack);
+  }
 }
 
 void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
                                         UnwindState &State, uint64_t Repeat) {
   if (Branch.IsArtificial)
     return;
-  uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source);
-  uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target);
-  SampleCounter &SCounter =
-      getOrCreateCounter(State.getBinary(), State.CallStack);
-  SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat);
+
+  if (Binary->usePseudoProbes()) {
+    // Same as recordRangeCount, We don't need to top frame probe since we will
+    // extract it from branch's source address
+    State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target,
+                                              Repeat);
+  } else {
+    State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target,
+                                              Repeat);
+  }
 }
 
 bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
@@ -199,6 +224,8 @@ bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
     // Record `branch` with calling context after unwinding.
     recordBranchCount(Branch, State, Repeat);
   }
+  // As samples are aggregated on trie, record them into counter map
+  collectSamplesFromFrameTrie(State.getDummyRootPtr());
 
   return true;
 }
@@ -325,7 +352,8 @@ void PerfReader::printUnwinderOutput() {
 void PerfReader::unwindSamples() {
   for (const auto &Item : AggregatedSamples) {
     const HybridSample *Sample = dyn_cast<HybridSample>(Item.first.getPtr());
-    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]);
+    VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary],
+                             Sample->Binary);
     Unwinder.unwind(Sample, Item.second);
   }
 
@@ -334,7 +362,7 @@ void PerfReader::unwindSamples() {
 }
 
 bool PerfReader::extractLBRStack(TraceStream &TraceIt,
-                                 SmallVector<LBREntry, 16> &LBRStack,
+                                 SmallVectorImpl<LBREntry> &LBRStack,
                                  ProfiledBinary *Binary) {
   // The raw format of LBR stack is like:
   // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
@@ -398,7 +426,7 @@ bool PerfReader::extractLBRStack(TraceStream &TraceIt,
 }
 
 bool PerfReader::extractCallstack(TraceStream &TraceIt,
-                                  std::list<uint64_t> &CallStack) {
+                                  SmallVectorImpl<uint64_t> &CallStack) {
   // The raw format of call stack is like:
   //            4005dc      # leaf frame
   //	          400634

diff  --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h
index 66649a060bc3..7eaa4b846259 100644
--- a/llvm/tools/llvm-profgen/PerfReader.h
+++ b/llvm/tools/llvm-profgen/PerfReader.h
@@ -133,7 +133,7 @@ struct HybridSample : public PerfSample {
   // Profiled binary that current frame address belongs to
   ProfiledBinary *Binary;
   // Call stack recorded in FILO(leaf to root) order
-  std::list<uint64_t> CallStack;
+  SmallVector<uint64_t, 16> CallStack;
   // LBR stack recorded in FIFO order
   SmallVector<LBREntry, 16> LBRStack;
 
@@ -147,7 +147,7 @@ struct HybridSample : public PerfSample {
     const HybridSample *Other = dyn_cast<HybridSample>(K);
     if (Other->Binary != Binary)
       return false;
-    const std::list<uint64_t> &OtherCallStack = Other->CallStack;
+    const SmallVector<uint64_t, 16> &OtherCallStack = Other->CallStack;
     const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;
 
     if (CallStack.size() != OtherCallStack.size() ||
@@ -193,14 +193,40 @@ using AggregatedCounter =
     std::unordered_map<Hashable<PerfSample>, uint64_t,
                        Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;
 
+using SampleVector = SmallVector<std::tuple<uint64_t, uint64_t, uint64_t>, 16>;
 // The state for the unwinder, it doesn't hold the data but only keep the
 // pointer/index of the data, While unwinding, the CallStack is changed
 // dynamicially and will be recorded as the context of the sample
 struct UnwindState {
   // Profiled binary that current frame address belongs to
   const ProfiledBinary *Binary;
-  // TODO: switch to use trie for call stack
-  std::list<uint64_t> CallStack;
+  // Call stack trie node
+  struct ProfiledFrame {
+    const uint64_t Address = 0;
+    ProfiledFrame *Parent;
+    SampleVector RangeSamples;
+    SampleVector BranchSamples;
+    std::unordered_map<uint64_t, std::unique_ptr<ProfiledFrame>> Children;
+
+    ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr)
+        : Address(Addr), Parent(P) {}
+    ProfiledFrame *getOrCreateChildFrame(uint64_t Address) {
+      assert(Address && "Address can't be zero!");
+      auto Ret = Children.emplace(
+          Address, std::make_unique<ProfiledFrame>(Address, this));
+      return Ret.first->second.get();
+    }
+    void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) {
+      RangeSamples.emplace_back(std::make_tuple(Start, End, Count));
+    }
+    void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) {
+      BranchSamples.emplace_back(std::make_tuple(Source, Target, Count));
+    }
+    bool isDummyRoot() { return Address == 0; }
+  };
+
+  ProfiledFrame DummyTrieRoot;
+  ProfiledFrame *CurrentLeafFrame;
   // Used to fall through the LBR stack
   uint32_t LBRIndex = 0;
   // Reference to HybridSample.LBRStack
@@ -208,19 +234,20 @@ struct UnwindState {
   // Used to iterate the address range
   InstructionPointer InstPtr;
   UnwindState(const HybridSample *Sample)
-      : Binary(Sample->Binary), CallStack(Sample->CallStack),
-        LBRStack(Sample->LBRStack),
-        InstPtr(Sample->Binary, Sample->CallStack.front()) {}
+      : Binary(Sample->Binary), LBRStack(Sample->LBRStack),
+        InstPtr(Sample->Binary, Sample->CallStack.front()) {
+    initFrameTrie(Sample->CallStack);
+  }
 
   bool validateInitialState() {
     uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
-    uint64_t StackLeaf = CallStack.front();
+    uint64_t LeafAddr = CurrentLeafFrame->Address;
     // When we take a stack sample, ideally the sampling distance between the
     // leaf IP of stack and the last LBR target shouldn't be very large.
     // Use a heuristic size (0x100) to filter out broken records.
-    if (StackLeaf < LBRLeaf || StackLeaf >= LBRLeaf + 0x100) {
+    if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) {
       WithColor::warning() << "Bogus trace: stack tip = "
-                           << format("%#010x", StackLeaf)
+                           << format("%#010x", LeafAddr)
                            << ", LBR tip = " << format("%#010x\n", LBRLeaf);
       return false;
     }
@@ -228,19 +255,40 @@ struct UnwindState {
   }
 
   void checkStateConsistency() {
-    assert(InstPtr.Address == CallStack.front() &&
+    assert(InstPtr.Address == CurrentLeafFrame->Address &&
            "IP should align with context leaf");
   }
 
-  std::string getExpandedContextStr() const {
-    return Binary->getExpandedContextStr(CallStack);
-  }
   const ProfiledBinary *getBinary() const { return Binary; }
   bool hasNextLBR() const { return LBRIndex < LBRStack.size(); }
   uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; }
   uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; }
   const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; }
   void advanceLBR() { LBRIndex++; }
+
+  ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; }
+
+  void pushFrame(uint64_t Address) {
+    CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address);
+  }
+
+  void switchToFrame(uint64_t Address) {
+    if (CurrentLeafFrame->Address == Address)
+      return;
+    CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address);
+  }
+
+  void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; }
+
+  void initFrameTrie(const SmallVectorImpl<uint64_t> &CallStack) {
+    ProfiledFrame *Cur = &DummyTrieRoot;
+    for (auto Address : reverse(CallStack)) {
+      Cur = Cur->getOrCreateChildFrame(Address);
+    }
+    CurrentLeafFrame = Cur;
+  }
+
+  ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; }
 };
 
 // Base class for sample counter key with context
@@ -330,6 +378,56 @@ using ContextSampleCounterMap =
     std::unordered_map<Hashable<ContextKey>, SampleCounter,
                        Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;
 
+struct FrameStack {
+  SmallVector<uint64_t, 16> Stack;
+  const ProfiledBinary *Binary;
+  FrameStack(const ProfiledBinary *B) : Binary(B) {}
+  bool pushFrame(UnwindState::ProfiledFrame *Cur) {
+    Stack.push_back(Cur->Address);
+    return true;
+  }
+
+  void popFrame() {
+    if (!Stack.empty())
+      Stack.pop_back();
+  }
+  std::shared_ptr<StringBasedCtxKey> getContextKey();
+};
+
+struct ProbeStack {
+  SmallVector<const PseudoProbe *, 16> Stack;
+  const ProfiledBinary *Binary;
+  ProbeStack(const ProfiledBinary *B) : Binary(B) {}
+  bool pushFrame(UnwindState::ProfiledFrame *Cur) {
+    const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Cur->Address);
+    // We may not find a probe for a merged or external callsite.
+    // Callsite merging may cause the loss of original probe IDs.
+    // Cutting off the context from here since the inliner will
+    // not know how to consume a context with unknown callsites.
+    if (!CallProbe)
+      return false;
+    Stack.push_back(CallProbe);
+    return true;
+  }
+
+  void popFrame() {
+    if (!Stack.empty())
+      Stack.pop_back();
+  }
+  // Use pseudo probe based context key to get the sample counter
+  // A context stands for a call path from 'main' to an uninlined
+  // callee with all inline frames recovered on that path. The probes
+  // belonging to that call path is the probes either originated from
+  // the callee or from any functions inlined into the callee. Since
+  // pseudo probes are organized in a tri-tree style after decoded,
+  // the tree path from the tri-tree root (which is the uninlined
+  // callee) to the probe node forms an inline context.
+  // Here we use a list of probe(pointer) as the context key to speed up
+  // aggregation and the final context string will be generate in
+  // ProfileGenerator
+  std::shared_ptr<ProbeBasedCtxKey> getContextKey();
+};
+
 /*
 As in hybrid sample we have a group of LBRs and the most recent sampling call
 stack, we can walk through those LBRs to infer more call stacks which would be
@@ -351,47 +449,43 @@ range as sample counter for further CS profile generation.
 */
 class VirtualUnwinder {
 public:
-  VirtualUnwinder(ContextSampleCounterMap *Counter) : CtxCounterMap(Counter) {}
+  VirtualUnwinder(ContextSampleCounterMap *Counter, const ProfiledBinary *B)
+      : CtxCounterMap(Counter), Binary(B) {}
+  bool unwind(const HybridSample *Sample, uint64_t Repeat);
 
+private:
   bool isCallState(UnwindState &State) const {
     // The tail call frame is always missing here in stack sample, we will
     // use a specific tail call tracker to infer it.
-    return State.getBinary()->addressIsCall(State.getCurrentLBRSource());
+    return Binary->addressIsCall(State.getCurrentLBRSource());
   }
 
   bool isReturnState(UnwindState &State) const {
     // Simply check addressIsReturn, as ret is always reliable, both for
     // regular call and tail call.
-    return State.getBinary()->addressIsReturn(State.getCurrentLBRSource());
+    return Binary->addressIsReturn(State.getCurrentLBRSource());
   }
 
   void unwindCall(UnwindState &State);
   void unwindLinear(UnwindState &State, uint64_t Repeat);
   void unwindReturn(UnwindState &State);
   void unwindBranchWithinFrame(UnwindState &State);
-  bool unwind(const HybridSample *Sample, uint64_t Repeat);
+
+  template <typename T>
+  void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack);
+  // Collect each samples on trie node by DFS traversal
+  template <typename T>
+  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack);
+  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur);
+
   void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
                         uint64_t Repeat);
   void recordBranchCount(const LBREntry &Branch, UnwindState &State,
                          uint64_t Repeat);
-  SampleCounter &getOrCreateCounter(const ProfiledBinary *Binary,
-                                    std::list<uint64_t> &CallStack);
-  // Use pseudo probe based context key to get the sample counter
-  // A context stands for a call path from 'main' to an uninlined
-  // callee with all inline frames recovered on that path. The probes
-  // belonging to that call path is the probes either originated from
-  // the callee or from any functions inlined into the callee. Since
-  // pseudo probes are organized in a tri-tree style after decoded,
-  // the tree path from the tri-tree root (which is the uninlined
-  // callee) to the probe node forms an inline context.
-  // Here we use a list of probe(pointer) as the context key to speed up
-  // aggregation and the final context string will be generate in
-  // ProfileGenerator
-  SampleCounter &getOrCreateCounterForProbe(const ProfiledBinary *Binary,
-                                            std::list<uint64_t> &CallStack);
 
-private:
   ContextSampleCounterMap *CtxCounterMap;
+  // Profiled binary that current frame address belongs to
+  const ProfiledBinary *Binary;
 };
 
 // Filename to binary map
@@ -457,10 +551,11 @@ class PerfReader {
   // Parse the hybrid sample including the call and LBR line
   void parseHybridSample(TraceStream &TraceIt);
   // Extract call stack from the perf trace lines
-  bool extractCallstack(TraceStream &TraceIt, std::list<uint64_t> &CallStack);
+  bool extractCallstack(TraceStream &TraceIt,
+                        SmallVectorImpl<uint64_t> &CallStack);
   // Extract LBR stack from one perf trace line
   bool extractLBRStack(TraceStream &TraceIt,
-                       SmallVector<LBREntry, 16> &LBRStack,
+                       SmallVectorImpl<LBREntry> &LBRStack,
                        ProfiledBinary *Binary);
   void checkAndSetPerfType(cl::list<std::string> &PerfTraceFilenames);
   // Post process the profile after trace aggregation, we will do simple range

diff  --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index 16ef04aba99e..2c6cedf57649 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -126,13 +126,13 @@ bool ProfiledBinary::inlineContextEqual(uint64_t Address1,
                     Context2.begin(), Context2.begin() + Context2.size() - 1);
 }
 
-std::string
-ProfiledBinary::getExpandedContextStr(const std::list<uint64_t> &Stack) const {
+std::string ProfiledBinary::getExpandedContextStr(
+    const SmallVectorImpl<uint64_t> &Stack) const {
   std::string ContextStr;
   SmallVector<std::string, 16> ContextVec;
   // Process from frame root to leaf
-  for (auto Iter = Stack.rbegin(); Iter != Stack.rend(); Iter++) {
-    uint64_t Offset = virtualAddrToOffset(*Iter);
+  for (auto Address : Stack) {
+    uint64_t Offset = virtualAddrToOffset(Address);
     const FrameLocationStack &ExpandedContext = getFrameLocationStack(Offset);
     for (const auto &Loc : ExpandedContext) {
       ContextVec.push_back(getCallSite(Loc));

diff  --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index bc28e58deb9d..f6c7460e186d 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -236,7 +236,8 @@ class ProfiledBinary {
   // Get the context string of the current stack with inline context filled in.
   // It will search the disassembling info stored in Offset2LocStackMap. This is
   // used as the key of function sample map
-  std::string getExpandedContextStr(const std::list<uint64_t> &stack) const;
+  std::string
+  getExpandedContextStr(const SmallVectorImpl<uint64_t> &Stack) const;
 
   const PseudoProbe *getCallProbeForAddr(uint64_t Address) const {
     return ProbeDecoder.getCallProbeForAddr(Address);


        


More information about the llvm-branch-commits mailing list