[llvm] r315635 - [XRay][tools] Updated stacks tool with flamegraph output.

Keith Wyss via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 12 15:47:42 PDT 2017


Author: kpw
Date: Thu Oct 12 15:47:42 2017
New Revision: 315635

URL: http://llvm.org/viewvc/llvm-project?rev=315635&view=rev
Log:
[XRay][tools] Updated stacks tool with flamegraph output.

Summary:
As the first step to allow analysis and visualization of xray collected data,
allow using the llvm-xray stacks tool to emit a complete listing of stacks in
the format consumable by a flamegraph tool.

Possible follow-up formats include the Chrome trace viewer format and SQL load
files.

As a POC, I'm able to generate flamegraphs of an xray instrumented llc compiling
hello world.

Reviewers: dberris, pelikan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D38650

Modified:
    llvm/trunk/tools/llvm-xray/xray-stacks.cc

Modified: llvm/trunk/tools/llvm-xray/xray-stacks.cc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-xray/xray-stacks.cc?rev=315635&r1=315634&r2=315635&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-xray/xray-stacks.cc (original)
+++ llvm/trunk/tools/llvm-xray/xray-stacks.cc Thu Oct 12 15:47:42 2017
@@ -66,8 +66,52 @@ static cl::opt<bool>
                      cl::desc("Aggregate stack times across threads"),
                      cl::sub(Stack), cl::init(false));
 
-/// A helper struct to work with formatv and XRayRecords. Makes it easier to use
-/// instrumentation map names or addresses in formatted output.
+static cl::opt<bool>
+    DumpAllStacks("all-stacks",
+                  cl::desc("Dump sum of timings for all stacks. "
+                           "By default separates stacks per-thread."),
+                  cl::sub(Stack), cl::init(false));
+static cl::alias DumpAllStacksShort("all", cl::aliasopt(DumpAllStacks),
+                                    cl::desc("Alias for -all-stacks"),
+                                    cl::sub(Stack));
+
+// TODO(kpw): Add other interesting formats. Perhaps chrome trace viewer format
+// possibly with aggregations or just a linear trace of timings.
+enum StackOutputFormat { HUMAN, FLAMETOOL };
+
+static cl::opt<StackOutputFormat> StacksOutputFormat(
+    "stack-format",
+    cl::desc("The format that output stacks should be "
+             "output in. Only applies with all-stacks."),
+    cl::values(
+        clEnumValN(HUMAN, "human",
+                   "Human readable output. Only valid without -all-stacks."),
+        clEnumValN(FLAMETOOL, "flame",
+                   "Format consumable by Brendan Gregg's FlameGraph tool. "
+                   "Only valid with -all-stacks.")),
+    cl::sub(Stack), cl::init(HUMAN));
+
+// Types of values for each stack in a CallTrie.
+enum class AggregationType {
+  TOTAL_TIME,      // The total time spent in a stack and its callees.
+  INVOCATION_COUNT // The number of times the stack was invoked.
+};
+
+static cl::opt<AggregationType> RequestedAggregation(
+    "aggregation-type",
+    cl::desc("The type of aggregation to do on call stacks."),
+    cl::values(
+        clEnumValN(
+            AggregationType::TOTAL_TIME, "time",
+            "Capture the total time spent in an all invocations of a stack."),
+        clEnumValN(AggregationType::INVOCATION_COUNT, "count",
+                   "Capture the number of times a stack was invoked. "
+                   "In flamegraph mode, this count also includes invocations "
+                   "of all callees.")),
+    cl::sub(Stack), cl::init(AggregationType::TOTAL_TIME));
+
+/// A helper struct to work with formatv and XRayRecords. Makes it easier to
+/// use instrumentation map names or addresses in formatted output.
 struct format_xray_record : public FormatAdapter<XRayRecord> {
   explicit format_xray_record(XRayRecord record,
                               const FuncIdConversionHelper &conv)
@@ -274,10 +318,45 @@ TrieNode *mergeTrieNodes(const TrieNode
   return Node;
 }
 
+template <AggregationType AggType>
+std::size_t GetValueForStack(const TrieNode *Node);
+
+// When computing total time spent in a stack, we're adding the timings from
+// its callees and the timings from when it was a leaf.
+template <>
+std::size_t
+GetValueForStack<AggregationType::TOTAL_TIME>(const TrieNode *Node) {
+  auto TopSum = std::accumulate(Node->TerminalDurations.begin(),
+                                Node->TerminalDurations.end(), 0uLL);
+  return std::accumulate(Node->IntermediateDurations.begin(),
+                         Node->IntermediateDurations.end(), TopSum);
+}
+
+// Calculates how many times a function was invoked.
+// TODO: Hook up option to produce stacks
+template <>
+std::size_t
+GetValueForStack<AggregationType::INVOCATION_COUNT>(const TrieNode *Node) {
+  return Node->TerminalDurations.size() + Node->IntermediateDurations.size();
+}
+
+// Make sure there are implementations for each enum value.
+template <AggregationType T> struct DependentFalseType : std::false_type {};
+
+template <AggregationType AggType>
+std::size_t GetValueForStack(const TrieNode *Node) {
+  static_assert(DependentFalseType<AggType>::value,
+                "No implementation found for aggregation type provided.");
+  return 0;
+}
+
 class StackTrie {
+  // Avoid the magic number of 4 propagated through the code with an alias.
+  // We use this SmallVector to track the root nodes in a call graph.
+  using RootVector = SmallVector<TrieNode *, 4>;
 
   // We maintain pointers to the roots of the tries we see.
-  DenseMap<uint32_t, SmallVector<TrieNode *, 4>> Roots;
+  DenseMap<uint32_t, RootVector> Roots;
 
   // We make sure all the nodes are accounted for in this list.
   std::forward_list<TrieNode> NodeStore;
@@ -439,11 +518,23 @@ public:
     }
   }
 
+  /// Prints timing sums for each stack in each thread.
+  template <AggregationType AggType>
+  void printAllPerThread(raw_ostream &OS, FuncIdConversionHelper &FN,
+                         StackOutputFormat format) {
+    for (auto iter : Roots) {
+      uint32_t threadId = iter.first;
+      RootVector &perThreadRoots = iter.second;
+      bool reportThreadId = true;
+      printAll<AggType>(OS, FN, perThreadRoots, threadId, reportThreadId);
+    }
+  }
+
   /// Prints top stacks from looking at all the leaves and ignoring thread IDs.
   /// Stacks that consist of the same function IDs but were called in different
   /// thread IDs are not considered unique in this printout.
   void printIgnoringThreads(raw_ostream &OS, FuncIdConversionHelper &FN) {
-    SmallVector<TrieNode *, 4> RootValues;
+    RootVector RootValues;
 
     // Function to pull the values out of a map iterator.
     using RootsType = decltype(Roots.begin())::value_type;
@@ -459,30 +550,88 @@ public:
     print(OS, FN, RootValues);
   }
 
-  /// Merges the trie by thread id before printing top stacks.
-  void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) {
-    std::forward_list<TrieNode> AggregatedNodeStore;
-    SmallVector<TrieNode *, 4> RootValues;
+  /// Creates a merged list of Tries for unique stacks that disregards their
+  /// thread IDs.
+  RootVector mergeAcrossThreads(std::forward_list<TrieNode> &NodeStore) {
+    RootVector MergedByThreadRoots;
     for (auto MapIter : Roots) {
       const auto &RootNodeVector = MapIter.second;
       for (auto *Node : RootNodeVector) {
-        auto MaybeFoundIter = find_if(RootValues, [Node](TrieNode *elem) {
-          return Node->FuncId == elem->FuncId;
-        });
-        if (MaybeFoundIter == RootValues.end()) {
-          RootValues.push_back(Node);
+        auto MaybeFoundIter =
+            find_if(MergedByThreadRoots, [Node](TrieNode *elem) {
+              return Node->FuncId == elem->FuncId;
+            });
+        if (MaybeFoundIter == MergedByThreadRoots.end()) {
+          MergedByThreadRoots.push_back(Node);
         } else {
-          RootValues.push_back(mergeTrieNodes(**MaybeFoundIter, *Node, nullptr,
-                                              AggregatedNodeStore));
-          RootValues.erase(MaybeFoundIter);
+          MergedByThreadRoots.push_back(
+              mergeTrieNodes(**MaybeFoundIter, *Node, nullptr, NodeStore));
+          MergedByThreadRoots.erase(MaybeFoundIter);
         }
       }
     }
-    print(OS, FN, RootValues);
+    return MergedByThreadRoots;
+  }
+
+  /// Print timing sums for all stacks merged by Thread ID.
+  template <AggregationType AggType>
+  void printAllAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN,
+                                  StackOutputFormat format) {
+    std::forward_list<TrieNode> AggregatedNodeStore;
+    RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
+    bool reportThreadId = false;
+    printAll<AggType>(OS, FN, MergedByThreadRoots,
+                      /*threadId*/ 0, reportThreadId);
+  }
+
+  /// Merges the trie by thread id before printing top stacks.
+  void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) {
+    std::forward_list<TrieNode> AggregatedNodeStore;
+    RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
+    print(OS, FN, MergedByThreadRoots);
+  }
+
+  // TODO: Add a format option when more than one format is supported.
+  template <AggregationType AggType>
+  void printAll(raw_ostream &OS, FuncIdConversionHelper &FN,
+                RootVector RootValues, uint32_t ThreadId, bool ReportThread) {
+    SmallVector<const TrieNode *, 16> S;
+    for (const auto *N : RootValues) {
+      S.clear();
+      S.push_back(N);
+      while (!S.empty()) {
+        auto *Top = S.pop_back_val();
+        printSingleStack<AggType>(OS, FN, ReportThread, ThreadId, Top);
+        for (const auto *C : Top->Callees)
+          S.push_back(C);
+      }
+    }
+  }
+
+  /// Prints values for stacks in a format consumable by the flamegraph.pl
+  /// tool. This is a line based format that lists each level in the stack
+  /// hierarchy in a semicolon delimited form followed by a space and a numeric
+  /// value. If breaking down by thread, the thread ID will be added as the
+  /// root level of the stack.
+  template <AggregationType AggType>
+  void printSingleStack(raw_ostream &OS, FuncIdConversionHelper &Converter,
+                        bool ReportThread, uint32_t ThreadId,
+                        const TrieNode *Node) {
+    if (ReportThread)
+      OS << "thread_" << ThreadId << ";";
+    SmallVector<const TrieNode *, 5> lineage{};
+    lineage.push_back(Node);
+    while (lineage.back()->Parent != nullptr)
+      lineage.push_back(lineage.back()->Parent);
+    while (!lineage.empty()) {
+      OS << Converter.SymbolOrNumber(lineage.back()->FuncId) << ";";
+      lineage.pop_back();
+    }
+    OS << " " << GetValueForStack<AggType>(Node) << "\n";
   }
 
   void print(raw_ostream &OS, FuncIdConversionHelper &FN,
-             SmallVector<TrieNode *, 4> RootValues) {
+             RootVector RootValues) {
     // Go through each of the roots, and traverse the call stack, producing the
     // aggregates as you go along. Remember these aggregates and stacks, and
     // show summary statistics about:
@@ -502,7 +651,7 @@ public:
       S.emplace_back(N);
 
       while (!S.empty()) {
-        auto Top = S.pop_back_val();
+        auto *Top = S.pop_back_val();
 
         // We only start printing the stack (by walking up the parent pointers)
         // when we get to a leaf function.
@@ -587,6 +736,17 @@ static CommandRegistration Unused(&Stack
               "that aggregates threads."),
         std::make_error_code(std::errc::invalid_argument));
 
+  if (!DumpAllStacks && StacksOutputFormat != HUMAN)
+    return make_error<StringError>(
+        Twine("Can't specify a non-human format without -all-stacks."),
+        std::make_error_code(std::errc::invalid_argument));
+
+  if (DumpAllStacks && StacksOutputFormat == HUMAN)
+    return make_error<StringError>(
+        Twine("You must specify a non-human format when reporting with "
+              "-all-stacks."),
+        std::make_error_code(std::errc::invalid_argument));
+
   symbolize::LLVMSymbolizer::Options Opts(
       symbolize::FunctionNameKind::LinkageName, true, true, false, "");
   symbolize::LLVMSymbolizer Symbolizer(Opts);
@@ -625,6 +785,44 @@ static CommandRegistration Unused(&Stack
         "No instrumented calls were accounted in the input file.",
         make_error_code(errc::result_out_of_range));
   }
+
+  // Report the stacks in a long form mode for another tool to analyze.
+  if (DumpAllStacks) {
+    if (AggregateThreads) {
+      switch (RequestedAggregation) {
+      case AggregationType::TOTAL_TIME:
+        ST.printAllAggregatingThreads<AggregationType::TOTAL_TIME>(
+            outs(), FuncIdHelper, StacksOutputFormat);
+        break;
+      case AggregationType::INVOCATION_COUNT:
+        ST.printAllAggregatingThreads<AggregationType::INVOCATION_COUNT>(
+            outs(), FuncIdHelper, StacksOutputFormat);
+        break;
+      default:
+        return make_error<StringError>(
+            "Illegal value for aggregation-type.",
+            make_error_code(errc::result_out_of_range));
+      }
+    } else {
+      switch (RequestedAggregation) {
+      case AggregationType::TOTAL_TIME:
+        ST.printAllPerThread<AggregationType::TOTAL_TIME>(outs(), FuncIdHelper,
+                                                          StacksOutputFormat);
+        break;
+      case AggregationType::INVOCATION_COUNT:
+        ST.printAllPerThread<AggregationType::INVOCATION_COUNT>(
+            outs(), FuncIdHelper, StacksOutputFormat);
+        break;
+      default:
+        return make_error<StringError>(
+            "Illegal value for aggregation-type.",
+            make_error_code(errc::result_out_of_range));
+      }
+    }
+    return Error::success();
+  }
+
+  // We're only outputting top stacks.
   if (AggregateThreads) {
     ST.printAggregatingThreads(outs(), FuncIdHelper);
   } else if (SeparateThreadStacks) {




More information about the llvm-commits mailing list