[compiler-rt] r346116 - [XRay] Update TSC math to handle wraparound

Dean Michael Berris via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 4 21:43:22 PST 2018


Author: dberris
Date: Sun Nov  4 21:43:22 2018
New Revision: 346116

URL: http://llvm.org/viewvc/llvm-project?rev=346116&view=rev
Log:
[XRay] Update TSC math to handle wraparound

Summary:
Prior to this change, we can run into situations where the TSC we're
getting when exiting a function is less than the TSC we got when
entering it. This would sometimes cause the counter for cumulative call
times overflow, which was erroneously also being stored as a signed
64-bit integer.

This change addresses both these issues while adding provisions for
tracking CPU migrations. We do this because moving from one CPU to
another doesn't guarantee that the timestamp counter for some
architectures aren't guaranteed to be synchronised. For the moment, we
leave the provisions there until we can update the data format to
include the counting of CPU migrations we can catch.

We update the necessary tests as well, ensuring that our expectations
for the cycle accounting to be met in case of counter wraparound.

Reviewers: mboerger

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54088

Modified:
    compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc
    compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc
    compiler-rt/trunk/lib/xray/xray_function_call_trie.h
    compiler-rt/trunk/lib/xray/xray_profiling.cc

Modified: compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc?rev=346116&r1=346115&r2=346116&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc (original)
+++ compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc Sun Nov  4 21:43:22 2018
@@ -10,9 +10,9 @@
 // This file is a part of XRay, a function call tracing system.
 //
 //===----------------------------------------------------------------------===//
-#include "gtest/gtest.h"
-
 #include "xray_function_call_trie.h"
+#include "gtest/gtest.h"
+#include <cstdint>
 
 namespace __xray {
 
@@ -29,26 +29,54 @@ TEST(FunctionCallTrieTest, EnterAndExitF
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
 
-  Trie.enterFunction(1, 1);
-  Trie.exitFunction(1, 2);
+  uint64_t TSC = 1;
+  uint16_t CPU = 0;
+  Trie.enterFunction(1, TSC++, CPU++);
+  Trie.exitFunction(1, TSC++, CPU++);
+  const auto &R = Trie.getRoots();
 
-  // We need a way to pull the data out. At this point, until we get a data
-  // collection service implemented, we're going to export the data as a list of
-  // roots, and manually walk through the structure ourselves.
+  ASSERT_EQ(R.size(), 1u);
+  ASSERT_EQ(R.front()->FId, 1);
+  ASSERT_EQ(R.front()->CallCount, 1u);
+  ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
+}
+
+TEST(FunctionCallTrieTest, HandleTSCOverflow) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
 
+  Trie.enterFunction(1, std::numeric_limits<uint64_t>::max(), 0);
+  Trie.exitFunction(1, 1, 0);
   const auto &R = Trie.getRoots();
 
   ASSERT_EQ(R.size(), 1u);
   ASSERT_EQ(R.front()->FId, 1);
-  ASSERT_EQ(R.front()->CallCount, 1);
+  ASSERT_EQ(R.front()->CallCount, 1u);
   ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
 }
 
+TEST(FunctionCallTrieTest, MaximalCumulativeTime) {
+  profilingFlags()->setDefaults();
+  auto A = FunctionCallTrie::InitAllocators();
+  FunctionCallTrie Trie(A);
+
+  Trie.enterFunction(1, 1, 0);
+  Trie.exitFunction(1, 0, 0);
+  const auto &R = Trie.getRoots();
+
+  ASSERT_EQ(R.size(), 1u);
+  ASSERT_EQ(R.front()->FId, 1);
+  ASSERT_EQ(R.front()->CallCount, 1u);
+  ASSERT_EQ(R.front()->CumulativeLocalTime,
+            std::numeric_limits<uint64_t>::max() - 1);
+}
+
 TEST(FunctionCallTrieTest, MissingFunctionEntry) {
   profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
-  Trie.exitFunction(1, 1);
+  Trie.exitFunction(1, 1, 0);
   const auto &R = Trie.getRoots();
 
   ASSERT_TRUE(R.empty());
@@ -58,9 +86,9 @@ TEST(FunctionCallTrieTest, NoMatchingEnt
   profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
-  Trie.enterFunction(2, 1);
-  Trie.enterFunction(3, 3);
-  Trie.exitFunction(1, 5);
+  Trie.enterFunction(2, 1, 0);
+  Trie.enterFunction(3, 3, 0);
+  Trie.exitFunction(1, 5, 0);
   const auto &R = Trie.getRoots();
 
   ASSERT_FALSE(R.empty());
@@ -71,7 +99,7 @@ TEST(FunctionCallTrieTest, MissingFuncti
   profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
-  Trie.enterFunction(1, 1);
+  Trie.enterFunction(1, 1, 0);
   const auto &R = Trie.getRoots();
 
   ASSERT_FALSE(R.empty());
@@ -84,12 +112,12 @@ TEST(FunctionCallTrieTest, MultipleRoots
   FunctionCallTrie Trie(A);
 
   // Enter and exit FId = 1.
-  Trie.enterFunction(1, 1);
-  Trie.exitFunction(1, 2);
+  Trie.enterFunction(1, 1, 0);
+  Trie.exitFunction(1, 2, 0);
 
   // Enter and exit FId = 2.
-  Trie.enterFunction(2, 3);
-  Trie.exitFunction(2, 4);
+  Trie.enterFunction(2, 3, 0);
+  Trie.exitFunction(2, 4, 0);
 
   const auto &R = Trie.getRoots();
   ASSERT_FALSE(R.empty());
@@ -126,11 +154,11 @@ TEST(FunctionCallTrieTest, MissingInterm
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
 
-  Trie.enterFunction(1, 0);
-  Trie.enterFunction(2, 100);
-  Trie.enterFunction(3, 200);
-  Trie.exitFunction(3, 300);
-  Trie.exitFunction(1, 400);
+  Trie.enterFunction(1, 0, 0);
+  Trie.enterFunction(2, 100, 0);
+  Trie.enterFunction(3, 200, 0);
+  Trie.exitFunction(3, 300, 0);
+  Trie.exitFunction(1, 400, 0);
 
   // What we should see at this point is all the functions in the trie in a
   // specific order (1 -> 2 -> 3) with the appropriate count(s) and local
@@ -153,12 +181,12 @@ TEST(FunctionCallTrieTest, MissingInterm
 
   // Now that we've established the preconditions, we check for specific aspects
   // of the nodes.
-  EXPECT_EQ(F3.CallCount, 1);
-  EXPECT_EQ(F2.CallCount, 1);
-  EXPECT_EQ(F1.CallCount, 1);
-  EXPECT_EQ(F3.CumulativeLocalTime, 100);
-  EXPECT_EQ(F2.CumulativeLocalTime, 300);
-  EXPECT_EQ(F1.CumulativeLocalTime, 100);
+  EXPECT_EQ(F3.CallCount, 1u);
+  EXPECT_EQ(F2.CallCount, 1u);
+  EXPECT_EQ(F1.CallCount, 1u);
+  EXPECT_EQ(F3.CumulativeLocalTime, 100u);
+  EXPECT_EQ(F2.CumulativeLocalTime, 300u);
+  EXPECT_EQ(F1.CumulativeLocalTime, 100u);
 }
 
 TEST(FunctionCallTrieTest, DeepCallStack) {
@@ -168,8 +196,8 @@ TEST(FunctionCallTrieTest, DeepCallStack
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
   for (int i = 0; i < 32; ++i)
-    Trie.enterFunction(i + 1, i);
-  Trie.exitFunction(1, 33);
+    Trie.enterFunction(i + 1, i, 0);
+  Trie.exitFunction(1, 33, 0);
 
   // Here, validate that we have a 32-level deep function call path from the
   // root (1) down to the leaf (33).
@@ -178,7 +206,7 @@ TEST(FunctionCallTrieTest, DeepCallStack
   auto F = R[0];
   for (int i = 0; i < 32; ++i) {
     EXPECT_EQ(F->FId, i + 1);
-    EXPECT_EQ(F->CallCount, 1);
+    EXPECT_EQ(F->CallCount, 1u);
     if (F->Callees.empty() && i != 31)
       FAIL() << "Empty callees for FId " << F->FId;
     if (i != 31)
@@ -193,12 +221,12 @@ TEST(FunctionCallTrieTest, DeepCopy) {
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
 
-  Trie.enterFunction(1, 0);
-  Trie.enterFunction(2, 1);
-  Trie.exitFunction(2, 2);
-  Trie.enterFunction(3, 3);
-  Trie.exitFunction(3, 4);
-  Trie.exitFunction(1, 5);
+  Trie.enterFunction(1, 0, 0);
+  Trie.enterFunction(2, 1, 0);
+  Trie.exitFunction(2, 2, 0);
+  Trie.enterFunction(3, 3, 0);
+  Trie.exitFunction(3, 4, 0);
+  Trie.exitFunction(1, 5, 0);
 
   // We want to make a deep copy and compare notes.
   auto B = FunctionCallTrie::InitAllocators();
@@ -236,20 +264,20 @@ TEST(FunctionCallTrieTest, MergeInto) {
   FunctionCallTrie T1(A);
 
   // 1 -> 2 -> 3
-  T0.enterFunction(1, 0);
-  T0.enterFunction(2, 1);
-  T0.enterFunction(3, 2);
-  T0.exitFunction(3, 3);
-  T0.exitFunction(2, 4);
-  T0.exitFunction(1, 5);
+  T0.enterFunction(1, 0, 0);
+  T0.enterFunction(2, 1, 0);
+  T0.enterFunction(3, 2, 0);
+  T0.exitFunction(3, 3, 0);
+  T0.exitFunction(2, 4, 0);
+  T0.exitFunction(1, 5, 0);
 
   // 1 -> 2 -> 3
-  T1.enterFunction(1, 0);
-  T1.enterFunction(2, 1);
-  T1.enterFunction(3, 2);
-  T1.exitFunction(3, 3);
-  T1.exitFunction(2, 4);
-  T1.exitFunction(1, 5);
+  T1.enterFunction(1, 0, 0);
+  T1.enterFunction(2, 1, 0);
+  T1.enterFunction(3, 2, 0);
+  T1.exitFunction(3, 3, 0);
+  T1.exitFunction(2, 4, 0);
+  T1.exitFunction(1, 5, 0);
 
   // We use a different allocator here to make sure that we're able to transfer
   // data into a FunctionCallTrie which uses a different allocator. This
@@ -264,20 +292,20 @@ TEST(FunctionCallTrieTest, MergeInto) {
   ASSERT_EQ(Merged.getRoots().size(), 1u);
   const auto &R0 = *Merged.getRoots()[0];
   EXPECT_EQ(R0.FId, 1);
-  EXPECT_EQ(R0.CallCount, 2);
-  EXPECT_EQ(R0.CumulativeLocalTime, 10);
+  EXPECT_EQ(R0.CallCount, 2u);
+  EXPECT_EQ(R0.CumulativeLocalTime, 10u);
   EXPECT_EQ(R0.Callees.size(), 1u);
 
   const auto &F1 = *R0.Callees[0].NodePtr;
   EXPECT_EQ(F1.FId, 2);
-  EXPECT_EQ(F1.CallCount, 2);
-  EXPECT_EQ(F1.CumulativeLocalTime, 6);
+  EXPECT_EQ(F1.CallCount, 2u);
+  EXPECT_EQ(F1.CumulativeLocalTime, 6u);
   EXPECT_EQ(F1.Callees.size(), 1u);
 
   const auto &F2 = *F1.Callees[0].NodePtr;
   EXPECT_EQ(F2.FId, 3);
-  EXPECT_EQ(F2.CallCount, 2);
-  EXPECT_EQ(F2.CumulativeLocalTime, 2);
+  EXPECT_EQ(F2.CallCount, 2u);
+  EXPECT_EQ(F2.CumulativeLocalTime, 2u);
   EXPECT_EQ(F2.Callees.size(), 0u);
 }
 

Modified: compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc?rev=346116&r1=346115&r2=346116&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc (original)
+++ compiler-rt/trunk/lib/xray/tests/unit/profile_collector_test.cc Sun Nov  4 21:43:22 2018
@@ -121,10 +121,10 @@ TEST(profileCollectorServiceTest, PostSe
   FunctionCallTrie T(Allocators);
 
   // Then, we populate the trie with some data.
-  T.enterFunction(1, 1);
-  T.enterFunction(2, 2);
-  T.exitFunction(2, 3);
-  T.exitFunction(1, 4);
+  T.enterFunction(1, 1, 0);
+  T.enterFunction(2, 2, 0);
+  T.exitFunction(2, 3, 0);
+  T.exitFunction(1, 4, 0);
 
   // Then we post the data to the global profile collector service.
   profileCollectorService::post(T, 1);
@@ -177,10 +177,10 @@ void threadProcessing() {
   thread_local auto Allocators = FunctionCallTrie::InitAllocators();
   FunctionCallTrie T(Allocators);
 
-  T.enterFunction(1, 1);
-  T.enterFunction(2, 2);
-  T.exitFunction(2, 3);
-  T.exitFunction(1, 4);
+  T.enterFunction(1, 1, 0);
+  T.enterFunction(2, 2, 0);
+  T.exitFunction(2, 3, 0);
+  T.exitFunction(1, 4, 0);
 
   profileCollectorService::post(T, GetTid());
 }

Modified: compiler-rt/trunk/lib/xray/xray_function_call_trie.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_function_call_trie.h?rev=346116&r1=346115&r2=346116&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_function_call_trie.h (original)
+++ compiler-rt/trunk/lib/xray/xray_function_call_trie.h Sun Nov  4 21:43:22 2018
@@ -18,6 +18,7 @@
 #include "xray_defs.h"
 #include "xray_profiling_flags.h"
 #include "xray_segmented_array.h"
+#include <limits>
 #include <memory> // For placement new.
 #include <utility>
 
@@ -113,13 +114,13 @@ public:
   struct Node {
     Node *Parent;
     NodeIdPairArray Callees;
-    int64_t CallCount;
-    int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+    uint64_t CallCount;
+    uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
     int32_t FId;
 
     // We add a constructor here to allow us to inplace-construct through
     // Array<...>'s AppendEmplace.
-    Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
+    Node(Node *P, NodeIdPairAllocatorType &A, uint64_t CC, uint64_t CLT,
          int32_t F) XRAY_NEVER_INSTRUMENT : Parent(P),
                                             Callees(A),
                                             CallCount(CC),
@@ -133,11 +134,14 @@ private:
   struct ShadowStackEntry {
     uint64_t EntryTSC;
     Node *NodePtr;
+    uint16_t EntryCPU;
 
     // We add a constructor here to allow us to inplace-construct through
     // Array<...>'s AppendEmplace.
-    ShadowStackEntry(uint64_t T, Node *N) XRAY_NEVER_INSTRUMENT : EntryTSC{T},
-                                                                  NodePtr{N} {}
+    ShadowStackEntry(uint64_t T, Node *N, uint16_t C) XRAY_NEVER_INSTRUMENT
+        : EntryTSC{T},
+          NodePtr{N},
+          EntryCPU{C} {}
   };
 
   using NodeArray = Array<Node>;
@@ -262,17 +266,18 @@ public:
         ShadowStack(*A.ShadowStackAllocator),
         NodeIdPairAllocator(A.NodeIdPairAllocator) {}
 
-  void enterFunction(const int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+  void enterFunction(const int32_t FId, uint64_t TSC,
+                     uint16_t CPU) XRAY_NEVER_INSTRUMENT {
     DCHECK_NE(FId, 0);
     // This function primarily deals with ensuring that the ShadowStack is
     // consistent and ready for when an exit event is encountered.
     if (UNLIKELY(ShadowStack.empty())) {
       auto NewRoot =
-          Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+          Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0u, 0u, FId);
       if (UNLIKELY(NewRoot == nullptr))
         return;
       Roots.Append(NewRoot);
-      ShadowStack.AppendEmplace(TSC, NewRoot);
+      ShadowStack.AppendEmplace(TSC, NewRoot, CPU);
       return;
     }
 
@@ -286,23 +291,24 @@ public:
         [FId](const NodeIdPair &NR) { return NR.FId == FId; });
     if (Callee != nullptr) {
       CHECK_NE(Callee->NodePtr, nullptr);
-      ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
+      ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU);
       return;
     }
 
     // This means we've never seen this stack before, create a new node here.
     auto NewNode =
-        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0u, 0u, FId);
     if (UNLIKELY(NewNode == nullptr))
       return;
     DCHECK_NE(NewNode, nullptr);
     TopNode->Callees.AppendEmplace(NewNode, FId);
-    ShadowStack.AppendEmplace(TSC, NewNode);
+    ShadowStack.AppendEmplace(TSC, NewNode, CPU);
     DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
     return;
   }
 
-  void exitFunction(int32_t FId, uint64_t TSC) XRAY_NEVER_INSTRUMENT {
+  void exitFunction(int32_t FId, uint64_t TSC,
+                    uint16_t CPU) XRAY_NEVER_INSTRUMENT {
     // When we exit a function, we look up the ShadowStack to see whether we've
     // entered this function before. We do as little processing here as we can,
     // since most of the hard work would have already been done at function
@@ -312,7 +318,24 @@ public:
       const auto &Top = ShadowStack.back();
       auto TopNode = Top.NodePtr;
       DCHECK_NE(TopNode, nullptr);
-      auto LocalTime = TSC - Top.EntryTSC;
+
+      // We may encounter overflow on the TSC we're provided, which may end up
+      // being less than the TSC when we first entered the function.
+      //
+      // To get the accurate measurement of cycles, we need to check whether
+      // we've overflowed (TSC < Top.EntryTSC) and then account the difference
+      // between the entry TSC and the max for the TSC counter (max of uint64_t)
+      // then add the value of TSC. We can prove that the maximum delta we will
+      // get is at most the 64-bit unsigned value, since the difference between
+      // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max()
+      // - 1) + 1.
+      //
+      // NOTE: This assumes that TSCs are synchronised across CPUs.
+      // TODO: Count the number of times we've seen CPU migrations.
+      uint64_t LocalTime =
+          Top.EntryTSC > TSC
+              ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
+              : TSC - Top.EntryTSC;
       TopNode->CallCount++;
       TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
       CumulativeTreeTime += LocalTime;
@@ -410,8 +433,8 @@ public:
       auto R = O.Roots.find_element(
           [&](const Node *Node) { return Node->FId == Root->FId; });
       if (R == nullptr) {
-        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
-                                           0, Root->FId);
+        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0u,
+                                           0u, Root->FId);
         if (UNLIKELY(TargetRoot == nullptr))
           return;
 
@@ -436,7 +459,7 @@ public:
               });
           if (TargetCallee == nullptr) {
             auto NewTargetNode = O.Nodes.AppendEmplace(
-                NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+                NT.TargetNode, *O.NodeIdPairAllocator, 0u, 0u, Callee.FId);
 
             if (UNLIKELY(NewTargetNode == nullptr))
               return;

Modified: compiler-rt/trunk/lib/xray/xray_profiling.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_profiling.cc?rev=346116&r1=346115&r2=346116&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_profiling.cc (original)
+++ compiler-rt/trunk/lib/xray/xray_profiling.cc Sun Nov  4 21:43:22 2018
@@ -197,11 +197,11 @@ void profilingHandleArg0(int32_t FuncId,
   switch (Entry) {
   case XRayEntryType::ENTRY:
   case XRayEntryType::LOG_ARGS_ENTRY:
-    TLD.FCT->enterFunction(FuncId, TSC);
+    TLD.FCT->enterFunction(FuncId, TSC, CPU);
     break;
   case XRayEntryType::EXIT:
   case XRayEntryType::TAIL:
-    TLD.FCT->exitFunction(FuncId, TSC);
+    TLD.FCT->exitFunction(FuncId, TSC, CPU);
     break;
   default:
     // FIXME: Handle bugs.




More information about the llvm-commits mailing list