[compiler-rt] r337342 - [XRay][compiler-rt] Simplify Allocator Implementation

Dean Michael Berris via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 17 18:53:39 PDT 2018


Author: dberris
Date: Tue Jul 17 18:53:39 2018
New Revision: 337342

URL: http://llvm.org/viewvc/llvm-project?rev=337342&view=rev
Log:
[XRay][compiler-rt] Simplify Allocator Implementation

Summary:
This change simplifies the XRay Allocator implementation to self-manage
an mmap'ed memory segment instead of using the internal allocator
implementation in sanitizer_common.
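
At a high level, the new allocator is just a bump allocator over a
single mmap'ed region. A minimal standalone C++ sketch of the idea
(the name BumpAllocator is illustrative; the real implementation in
xray_allocator.h below additionally guards allocation with a spin lock,
aligns blocks to cache lines, and reports failures through Report()):

  #include <sys/mman.h>
  #include <cstddef>

  template <size_t N> struct BumpAllocator {
    explicit BumpAllocator(size_t Max) : MaxMemory(Max) {}

    void *Allocate() {
      if (BackingStore == nullptr) {
        // Lazily reserve the whole budget up front with a single mmap.
        BackingStore = mmap(nullptr, MaxMemory, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (BackingStore == MAP_FAILED) {
          BackingStore = nullptr;
          return nullptr;
        }
        NextBlock = static_cast<char *>(BackingStore);
      }
      // Refuse to hand out more memory than the configured maximum.
      if (AllocatedBlocks * N >= MaxMemory)
        return nullptr;
      void *Result = NextBlock;
      NextBlock += N;
      ++AllocatedBlocks;
      return Result;
    }

    ~BumpAllocator() {
      if (BackingStore != nullptr)
        munmap(BackingStore, MaxMemory);
    }

  private:
    const size_t MaxMemory;
    void *BackingStore = nullptr;
    char *NextBlock = nullptr;
    size_t AllocatedBlocks = 0;
  };

All memory is returned to the system in a single munmap call when the
allocator is destroyed, so there is no per-block free path.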

Benchmarks and profiles gathered in D48879 show that using the internal
allocator in sanitizer_common introduces a bottleneck: all allocations
are funnelled through a central spinlock. This change allows
thread-local allocators to eliminate contention on that centralized
allocator.
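
Each thread now initializes its own set of allocators before
constructing its FunctionCallTrie; the pattern, as exercised by the
updated unit tests, looks roughly like:

  profilingFlags()->setDefaults();
  auto Allocators = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(Allocators);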

To get the most benefit from this approach, we also use a managed
allocator for the chunk elements used by the segmented array
implementation. This lets us amortize the cost of allocating memory
when creating these internal segmented array data structures.
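
Concretely, a segmented array is now constructed with both an element
allocator and a chunk allocator; the updated unit tests do roughly the
following:

  using AllocatorType = typename Array<TestData>::AllocatorType;
  AllocatorType A(1 << 4);
  ChunkAllocator CA(1 << 4);
  Array<TestData> Data(A, CA);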

We also took the opportunity to remove the preallocation argument from
the allocator API, simplifying the usage of the allocator throughout the
profiling implementation.
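
In practice this means allocator constructions drop their trailing zero
argument, e.g. in the allocator tests:

  // Before: Allocator<sizeof(TestData)> A(2 << 11, 0);
  Allocator<sizeof(TestData)> A(2 << 11);
  auto B = A.Allocate();
  ASSERT_NE(B.Data, nullptr);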

In this change we also tweak some of the flag values to reduce the
maximum amount of memory each thread requests through mmap (e.g.
stack_allocator_max drops from 2 << 24 to 2 << 20).

Depends on D48956.

Reviewers: kpw, eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D49217

Modified:
    compiler-rt/trunk/lib/xray/tests/unit/allocator_test.cc
    compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc
    compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc
    compiler-rt/trunk/lib/xray/xray_allocator.h
    compiler-rt/trunk/lib/xray/xray_function_call_trie.h
    compiler-rt/trunk/lib/xray/xray_profile_collector.cc
    compiler-rt/trunk/lib/xray/xray_profiling.cc
    compiler-rt/trunk/lib/xray/xray_profiling_flags.inc
    compiler-rt/trunk/lib/xray/xray_segmented_array.h
    compiler-rt/trunk/lib/xray/xray_utils.h

Modified: compiler-rt/trunk/lib/xray/tests/unit/allocator_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/tests/unit/allocator_test.cc?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/tests/unit/allocator_test.cc (original)
+++ compiler-rt/trunk/lib/xray/tests/unit/allocator_test.cc Tue Jul 17 18:53:39 2018
@@ -22,16 +22,16 @@ struct TestData {
   s64 Second;
 };
 
-TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11, 0); }
+TEST(AllocatorTest, Construction) { Allocator<sizeof(TestData)> A(2 << 11); }
 
 TEST(AllocatorTest, Allocate) {
-  Allocator<sizeof(TestData)> A(2 << 11, 0);
+  Allocator<sizeof(TestData)> A(2 << 11);
   auto B = A.Allocate();
   ASSERT_NE(B.Data, nullptr);
 }
 
 TEST(AllocatorTest, OverAllocate) {
-  Allocator<sizeof(TestData)> A(sizeof(TestData), 0);
+  Allocator<sizeof(TestData)> A(sizeof(TestData));
   auto B1 = A.Allocate();
   (void)B1;
   auto B2 = A.Allocate();

Modified: compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc (original)
+++ compiler-rt/trunk/lib/xray/tests/unit/function_call_trie_test.cc Tue Jul 17 18:53:39 2018
@@ -19,8 +19,6 @@ namespace __xray {
 namespace {
 
 TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
-  // FIXME: Support passing in configuration for allocators in the allocator
-  // constructors.
   profilingFlags()->setDefaults();
   FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(Allocators);
@@ -47,6 +45,7 @@ TEST(FunctionCallTrieTest, EnterAndExitF
 }
 
 TEST(FunctionCallTrieTest, MissingFunctionEntry) {
+  profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
   Trie.exitFunction(1, 1);
@@ -56,6 +55,7 @@ TEST(FunctionCallTrieTest, MissingFuncti
 }
 
 TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
+  profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
   Trie.enterFunction(2, 1);
@@ -63,16 +63,19 @@ TEST(FunctionCallTrieTest, NoMatchingEnt
   Trie.exitFunction(1, 5);
   const auto &R = Trie.getRoots();
 
-  ASSERT_TRUE(R.empty());
+  ASSERT_FALSE(R.empty());
+  EXPECT_EQ(R.size(), size_t{1});
 }
 
 TEST(FunctionCallTrieTest, MissingFunctionExit) {
+  profilingFlags()->setDefaults();
   auto A = FunctionCallTrie::InitAllocators();
   FunctionCallTrie Trie(A);
   Trie.enterFunction(1, 1);
   const auto &R = Trie.getRoots();
 
-  ASSERT_TRUE(R.empty());
+  ASSERT_FALSE(R.empty());
+  EXPECT_EQ(R.size(), size_t{1});
 }
 
 TEST(FunctionCallTrieTest, MultipleRoots) {

Modified: compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc (original)
+++ compiler-rt/trunk/lib/xray/tests/unit/segmented_array_test.cc Tue Jul 17 18:53:39 2018
@@ -12,27 +12,29 @@ struct TestData {
   TestData(s64 F, s64 S) : First(F), Second(S) {}
 };
 
-TEST(SegmentedArrayTest, Construction) {
-  Array<TestData> Data;
-  (void)Data;
-}
-
-TEST(SegmentedArrayTest, ConstructWithAllocator) {
+TEST(SegmentedArrayTest, ConstructWithAllocators) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
-  AllocatorType A(1 << 4, 0);
-  Array<TestData> Data(A);
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> Data(A, CA);
   (void)Data;
 }
 
 TEST(SegmentedArrayTest, ConstructAndPopulate) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.Append(TestData{0, 0}), nullptr);
   ASSERT_NE(data.Append(TestData{1, 1}), nullptr);
   ASSERT_EQ(data.size(), 2u);
 }
 
 TEST(SegmentedArrayTest, ConstructPopulateAndLookup) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.Append(TestData{0, 1}), nullptr);
   ASSERT_EQ(data.size(), 1u);
   ASSERT_EQ(data[0].First, 0);
@@ -40,7 +42,10 @@ TEST(SegmentedArrayTest, ConstructPopula
 }
 
 TEST(SegmentedArrayTest, PopulateWithMoreElements) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 24);
+  ChunkAllocator CA(1 << 20);
+  Array<TestData> data(A, CA);
   static const auto kMaxElements = 100u;
   for (auto I = 0u; I < kMaxElements; ++I) {
     ASSERT_NE(data.Append(TestData{I, I + 1}), nullptr);
@@ -53,14 +58,20 @@ TEST(SegmentedArrayTest, PopulateWithMor
 }
 
 TEST(SegmentedArrayTest, AppendEmplace) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
   ASSERT_EQ(data[0].First, 1);
   ASSERT_EQ(data[0].Second, 1);
 }
 
 TEST(SegmentedArrayTest, AppendAndTrim) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
   ASSERT_EQ(data.size(), 1u);
   data.trim(1);
@@ -69,7 +80,10 @@ TEST(SegmentedArrayTest, AppendAndTrim)
 }
 
 TEST(SegmentedArrayTest, IteratorAdvance) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_TRUE(data.empty());
   ASSERT_EQ(data.begin(), data.end());
   auto I0 = data.begin();
@@ -88,7 +102,10 @@ TEST(SegmentedArrayTest, IteratorAdvance
 }
 
 TEST(SegmentedArrayTest, IteratorRetreat) {
-  Array<TestData> data;
+  using AllocatorType = typename Array<TestData>::AllocatorType;
+  AllocatorType A(1 << 4);
+  ChunkAllocator CA(1 << 4);
+  Array<TestData> data(A, CA);
   ASSERT_TRUE(data.empty());
   ASSERT_EQ(data.begin(), data.end());
   ASSERT_NE(data.AppendEmplace(1, 1), nullptr);
@@ -108,8 +125,9 @@ TEST(SegmentedArrayTest, IteratorRetreat
 
 TEST(SegmentedArrayTest, IteratorTrimBehaviour) {
   using AllocatorType = typename Array<TestData>::AllocatorType;
-  AllocatorType A(1 << 10, 0);
-  Array<TestData> Data(A);
+  AllocatorType A(1 << 20);
+  ChunkAllocator CA(1 << 10);
+  Array<TestData> Data(A, CA);
   ASSERT_TRUE(Data.empty());
   auto I0Begin = Data.begin(), I0End = Data.end();
   // Add enough elements in Data to have more than one chunk.
@@ -160,8 +178,9 @@ struct ShadowStackEntry {
 
 TEST(SegmentedArrayTest, SimulateStackBehaviour) {
   using AllocatorType = typename Array<ShadowStackEntry>::AllocatorType;
-  AllocatorType A(1 << 10, 0);
-  Array<ShadowStackEntry> Data(A);
+  AllocatorType A(1 << 10);
+  ChunkAllocator CA(1 << 10);
+  Array<ShadowStackEntry> Data(A, CA);
   static uint64_t Dummy = 0;
   constexpr uint64_t Max = 9;
 

Modified: compiler-rt/trunk/lib/xray/xray_allocator.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_allocator.h?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_allocator.h (original)
+++ compiler-rt/trunk/lib/xray/xray_allocator.h Tue Jul 17 18:53:39 2018
@@ -16,10 +16,11 @@
 #ifndef XRAY_ALLOCATOR_H
 #define XRAY_ALLOCATOR_H
 
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_internal_defs.h"
 #include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_posix.h"
+#include "sys/mman.h"
 #include "xray_utils.h"
 #include <cstddef>
 #include <cstdint>
@@ -36,130 +37,87 @@ namespace __xray {
 /// allocation function. N is used to compute the size of a block, which is
 /// cache-line-size multiples worth of memory. We compute the size of a block by
 /// determining how many cache lines worth of memory is required to subsume N.
+///
+/// The Allocator instance will manage its own memory acquired through mmap.
+/// This severely constrains the platforms on which this can be used to POSIX
+/// systems where mmap semantics are well-defined.
+///
+/// FIXME: Isolate the lower-level memory management to a different abstraction
+/// that can be platform-specific.
 template <size_t N> struct Allocator {
   // The Allocator returns memory as Block instances.
   struct Block {
     /// Compute the minimum cache-line size multiple that is >= N.
     static constexpr auto Size = nearest_boundary(N, kCacheLineSize);
-    void *Data = nullptr;
+    void *Data;
   };
 
 private:
-  // A BlockLink will contain a fixed number of blocks, each with an identifier
-  // to specify whether it's been handed out or not. We keep track of BlockLink
-  // iterators, which are basically a pointer to the link and an offset into
-  // the fixed set of blocks associated with a link. The iterators are
-  // bidirectional.
-  //
-  // We're calling it a "link" in the context of seeing these as a chain of
-  // block pointer containers (i.e. links in a chain).
-  struct BlockLink {
-    static_assert(kCacheLineSize % sizeof(void *) == 0,
-                  "Cache line size is not divisible by size of void*; none of "
-                  "the assumptions of the BlockLink will hold.");
-
-    // We compute the number of pointers to areas in memory where we consider as
-    // individual blocks we've allocated. To ensure that instances of the
-    // BlockLink object are cache-line sized, we deduct two pointers worth
-    // representing the pointer to the previous link and the backing store for
-    // the whole block.
-    //
-    // This structure corresponds to the following layout:
-    //
-    //   Blocks [ 0, 1, 2, .., BlockPtrCount - 2]
-    //
-    static constexpr auto BlockPtrCount =
-        (kCacheLineSize / sizeof(Block *)) - 2;
-
-    BlockLink() {
-      // Zero out Blocks.
-      // FIXME: Use a braced member initializer when we drop support for GCC
-      // 4.8.
-      internal_memset(Blocks, 0, sizeof(Blocks));
-    }
-
-    // FIXME: Align this to cache-line address boundaries?
-    Block Blocks[BlockPtrCount];
-    BlockLink *Prev = nullptr;
-    void *BackingStore = nullptr;
-  };
-
-  static_assert(sizeof(BlockLink) == kCacheLineSize,
-                "BlockLink instances must be cache-line-sized.");
-
-  static BlockLink NullLink;
-
-  // FIXME: Implement a freelist, in case we actually do intend to return memory
-  // to the allocator, as opposed to just de-allocating everything in one go?
-
-  size_t MaxMemory;
+  const size_t MaxMemory{0};
+  void *BackingStore = nullptr;
+  void *AlignedNextBlock = nullptr;
+  size_t AllocatedBlocks = 0;
   SpinMutex Mutex{};
-  BlockLink *Tail = &NullLink;
-  size_t Counter = 0;
 
-  BlockLink *NewChainLink(uint64_t Alignment) {
-    auto NewChain = reinterpret_cast<BlockLink *>(
-        InternalAlloc(sizeof(BlockLink), nullptr, kCacheLineSize));
-    auto BackingStore = reinterpret_cast<char *>(InternalAlloc(
-        (BlockLink::BlockPtrCount + 1) * Block::Size, nullptr, Alignment));
-    size_t Offset = 0;
-    DCHECK_NE(NewChain, nullptr);
-    DCHECK_NE(BackingStore, nullptr);
-    NewChain->BackingStore = BackingStore;
-
-    // Here we ensure that the alignment of the pointers we're handing out
-    // adhere to the alignment requirements of the call to Allocate().
-    for (auto &B : NewChain->Blocks) {
-      auto AlignmentAdjustment =
-          nearest_boundary(reinterpret_cast<uintptr_t>(BackingStore + Offset),
-                           Alignment) -
-          reinterpret_cast<uintptr_t>(BackingStore + Offset);
-      B.Data = BackingStore + AlignmentAdjustment + Offset;
-      DCHECK_EQ(reinterpret_cast<uintptr_t>(B.Data) % Alignment, 0);
-      Offset += AlignmentAdjustment + Block::Size;
+  void *Alloc() {
+    SpinMutexLock Lock(&Mutex);
+    if (UNLIKELY(BackingStore == nullptr)) {
+      BackingStore = reinterpret_cast<void *>(
+          internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0));
+      if (BackingStore == MAP_FAILED) {
+        BackingStore = nullptr;
+        if (Verbosity())
+          Report("XRay Profiling: Failed to allocate memory for allocator.\n");
+        return nullptr;
+      }
+
+      AlignedNextBlock = BackingStore;
+
+      // Ensure that NextBlock is aligned appropriately.
+      auto BackingStoreNum = reinterpret_cast<uintptr_t>(BackingStore);
+      auto AlignedNextBlockNum = nearest_boundary(
+          reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize);
+      if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) {
+        munmap(BackingStore, MaxMemory);
+        AlignedNextBlock = BackingStore = nullptr;
+        if (Verbosity())
+          Report("XRay Profiling: Cannot obtain enough memory from "
+                 "preallocated region.\n");
+        return nullptr;
+      }
+
+      AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum);
+
+      // Assert that AlignedNextBlock is cache-line aligned.
+      DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize,
+                0);
     }
-    NewChain->Prev = Tail;
-    return NewChain;
+
+    if ((AllocatedBlocks * Block::Size) >= MaxMemory)
+      return nullptr;
+
+    // Align the pointer we'd like to return to an appropriate alignment, then
+    // advance the pointer from where to start allocations.
+    void *Result = AlignedNextBlock;
+    AlignedNextBlock = reinterpret_cast<void *>(
+        reinterpret_cast<char *>(AlignedNextBlock) + N);
+    ++AllocatedBlocks;
+    return Result;
   }
 
 public:
-  Allocator(size_t M, size_t PreAllocate) : MaxMemory(M) {
-    // FIXME: Implement PreAllocate support!
-  }
+  explicit Allocator(size_t M)
+      : MaxMemory(nearest_boundary(M, kCacheLineSize)) {}
 
-  Block Allocate(uint64_t Alignment = 8) {
-    SpinMutexLock Lock(&Mutex);
-    // Check whether we're over quota.
-    if (Counter * Block::Size >= MaxMemory)
-      return {};
-
-    size_t ChainOffset = Counter % BlockLink::BlockPtrCount;
-
-    Block B{};
-    BlockLink *Link = Tail;
-    if (UNLIKELY(Counter == 0 || ChainOffset == 0))
-      Tail = Link = NewChainLink(Alignment);
-
-    B = Link->Blocks[ChainOffset];
-    ++Counter;
-    return B;
-  }
+  Block Allocate() { return {Alloc()}; }
 
   ~Allocator() NOEXCEPT {
-    // We need to deallocate all the blocks, including the chain links.
-    for (auto *C = Tail; C != &NullLink;) {
-      // We know that the data block is a large contiguous page, we deallocate
-      // that at once.
-      InternalFree(C->BackingStore);
-      auto Prev = C->Prev;
-      InternalFree(C);
-      C = Prev;
+    if (BackingStore != nullptr) {
+      internal_munmap(BackingStore, MaxMemory);
     }
   }
-}; // namespace __xray
-
-// Storage for the NullLink sentinel.
-template <size_t N> typename Allocator<N>::BlockLink Allocator<N>::NullLink;
+};
 
 } // namespace __xray
 

Modified: compiler-rt/trunk/lib/xray/xray_function_call_trie.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_function_call_trie.h?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_function_call_trie.h (original)
+++ compiler-rt/trunk/lib/xray/xray_function_call_trie.h Tue Jul 17 18:53:39 2018
@@ -15,6 +15,7 @@
 #ifndef XRAY_FUNCTION_CALL_TRIE_H
 #define XRAY_FUNCTION_CALL_TRIE_H
 
+#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "xray_profiling_flags.h"
 #include "xray_segmented_array.h"
 #include <memory> // For placement new.
@@ -118,9 +119,9 @@ public:
 
     // We add a constructor here to allow us to inplace-construct through
     // Array<...>'s AppendEmplace.
-    Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
-         int32_t F)
-        : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
+    Node(Node *P, NodeIdPairAllocatorType &A, ChunkAllocator &CA, int64_t CC,
+         int64_t CLT, int32_t F)
+        : Parent(P), Callees(A, CA), CallCount(CC), CumulativeLocalTime(CLT),
           FId(F) {}
 
     // TODO: Include the compact histogram.
@@ -152,6 +153,7 @@ public:
     RootAllocatorType *RootAllocator = nullptr;
     ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
     NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+    ChunkAllocator *ChunkAlloc = nullptr;
 
     Allocators() {}
     Allocators(const Allocators &) = delete;
@@ -160,11 +162,12 @@ public:
     Allocators(Allocators &&O)
         : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
           ShadowStackAllocator(O.ShadowStackAllocator),
-          NodeIdPairAllocator(O.NodeIdPairAllocator) {
+          NodeIdPairAllocator(O.NodeIdPairAllocator), ChunkAlloc(O.ChunkAlloc) {
       O.NodeAllocator = nullptr;
       O.RootAllocator = nullptr;
       O.ShadowStackAllocator = nullptr;
       O.NodeIdPairAllocator = nullptr;
+      O.ChunkAlloc = nullptr;
     }
 
     Allocators &operator=(Allocators &&O) {
@@ -188,6 +191,11 @@ public:
         O.NodeIdPairAllocator = this->NodeIdPairAllocator;
         this->NodeIdPairAllocator = Tmp;
       }
+      {
+        auto Tmp = O.ChunkAlloc;
+        O.ChunkAlloc = this->ChunkAlloc;
+        this->ChunkAlloc = Tmp;
+      }
       return *this;
     }
 
@@ -198,18 +206,27 @@ public:
       if (NodeAllocator != nullptr) {
         NodeAllocator->~NodeAllocatorType();
         InternalFree(NodeAllocator);
+        NodeAllocator = nullptr;
       }
       if (RootAllocator != nullptr) {
         RootAllocator->~RootAllocatorType();
         InternalFree(RootAllocator);
+        RootAllocator = nullptr;
       }
       if (ShadowStackAllocator != nullptr) {
         ShadowStackAllocator->~ShadowStackAllocatorType();
         InternalFree(ShadowStackAllocator);
+        ShadowStackAllocator = nullptr;
       }
       if (NodeIdPairAllocator != nullptr) {
         NodeIdPairAllocator->~NodeIdPairAllocatorType();
         InternalFree(NodeIdPairAllocator);
+        NodeIdPairAllocator = nullptr;
+      }
+      if (ChunkAlloc != nullptr) {
+        ChunkAlloc->~ChunkAllocator();
+        InternalFree(ChunkAlloc);
+        ChunkAlloc = nullptr;
       }
     }
   };
@@ -223,24 +240,29 @@ public:
     Allocators A;
     auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
         InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
-    new (NodeAllocator) Allocators::NodeAllocatorType(Max, 0);
+    new (NodeAllocator) Allocators::NodeAllocatorType(Max);
     A.NodeAllocator = NodeAllocator;
 
     auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
         InternalAlloc(sizeof(Allocators::RootAllocatorType)));
-    new (RootAllocator) Allocators::RootAllocatorType(Max, 0);
+    new (RootAllocator) Allocators::RootAllocatorType(Max);
     A.RootAllocator = RootAllocator;
 
     auto ShadowStackAllocator =
         reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
             InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
-    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max, 0);
+    new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max);
     A.ShadowStackAllocator = ShadowStackAllocator;
 
     auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
         InternalAlloc(sizeof(NodeIdPairAllocatorType)));
-    new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max, 0);
+    new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
     A.NodeIdPairAllocator = NodeIdPairAllocator;
+
+    auto ChunkAlloc = reinterpret_cast<ChunkAllocator *>(
+        InternalAlloc(sizeof(ChunkAllocator)));
+    new (ChunkAlloc) ChunkAllocator(Max);
+    A.ChunkAlloc = ChunkAlloc;
     return A;
   }
 
@@ -249,20 +271,22 @@ private:
   RootArray Roots;
   ShadowStackArray ShadowStack;
   NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+  ChunkAllocator *ChunkAlloc = nullptr;
 
 public:
   explicit FunctionCallTrie(const Allocators &A)
-      : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
-        ShadowStack(*A.ShadowStackAllocator),
-        NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+      : Nodes(*A.NodeAllocator, *A.ChunkAlloc),
+        Roots(*A.RootAllocator, *A.ChunkAlloc),
+        ShadowStack(*A.ShadowStackAllocator, *A.ChunkAlloc),
+        NodeIdPairAllocator(A.NodeIdPairAllocator), ChunkAlloc(A.ChunkAlloc) {}
 
   void enterFunction(const int32_t FId, uint64_t TSC) {
     DCHECK_NE(FId, 0);
     // This function primarily deals with ensuring that the ShadowStack is
     // consistent and ready for when an exit event is encountered.
     if (UNLIKELY(ShadowStack.empty())) {
-      auto NewRoot =
-          Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+      auto NewRoot = Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator,
+                                         *ChunkAlloc, 0, 0, FId);
       if (UNLIKELY(NewRoot == nullptr))
         return;
       Roots.Append(NewRoot);
@@ -285,8 +309,8 @@ public:
     }
 
     // This means we've never seen this stack before, create a new node here.
-    auto NewNode =
-        Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+    auto NewNode = Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator,
+                                       *ChunkAlloc, 0, 0, FId);
     if (UNLIKELY(NewNode == nullptr))
       return;
     DCHECK_NE(NewNode, nullptr);
@@ -345,13 +369,14 @@ public:
     using Stack = Array<NodeAndParent>;
 
     typename Stack::AllocatorType StackAllocator(
-        profilingFlags()->stack_allocator_max, 0);
-    Stack DFSStack(StackAllocator);
+        profilingFlags()->stack_allocator_max);
+    ChunkAllocator StackChunkAllocator(profilingFlags()->stack_allocator_max);
+    Stack DFSStack(StackAllocator, StackChunkAllocator);
 
     for (const auto Root : getRoots()) {
       // Add a node in O for this root.
       auto NewRoot = O.Nodes.AppendEmplace(
-          nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+          nullptr, *O.NodeIdPairAllocator, *O.ChunkAlloc, Root->CallCount,
           Root->CumulativeLocalTime, Root->FId);
 
       // Because we cannot allocate more memory we should bail out right away.
@@ -370,8 +395,9 @@ public:
         DFSStack.trim(1);
         for (const auto Callee : NP.Node->Callees) {
           auto NewNode = O.Nodes.AppendEmplace(
-              NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
-              Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+              NP.NewNode, *O.NodeIdPairAllocator, *O.ChunkAlloc,
+              Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
+              Callee.FId);
           if (UNLIKELY(NewNode == nullptr))
             return;
           NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
@@ -396,16 +422,17 @@ public:
     };
     using Stack = Array<NodeAndTarget>;
     typename Stack::AllocatorType StackAllocator(
-        profilingFlags()->stack_allocator_max, 0);
-    Stack DFSStack(StackAllocator);
+        profilingFlags()->stack_allocator_max);
+    ChunkAllocator CA(profilingFlags()->stack_allocator_max);
+    Stack DFSStack(StackAllocator, CA);
 
     for (const auto Root : getRoots()) {
       Node *TargetRoot = nullptr;
       auto R = O.Roots.find_element(
           [&](const Node *Node) { return Node->FId == Root->FId; });
       if (R == nullptr) {
-        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
-                                           0, Root->FId);
+        TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator,
+                                           *O.ChunkAlloc, 0, 0, Root->FId);
         if (UNLIKELY(TargetRoot == nullptr))
           return;
 
@@ -429,8 +456,9 @@ public:
                 return C.FId == Callee.FId;
               });
           if (TargetCallee == nullptr) {
-            auto NewTargetNode = O.Nodes.AppendEmplace(
-                NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+            auto NewTargetNode =
+                O.Nodes.AppendEmplace(NT.TargetNode, *O.NodeIdPairAllocator,
+                                      *O.ChunkAlloc, 0, 0, Callee.FId);
 
             if (UNLIKELY(NewTargetNode == nullptr))
               return;

Modified: compiler-rt/trunk/lib/xray/xray_profile_collector.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_profile_collector.cc?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_profile_collector.cc (original)
+++ compiler-rt/trunk/lib/xray/xray_profile_collector.cc Tue Jul 17 18:53:39 2018
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_vector.h"
 #include "xray_profiling_flags.h"
@@ -117,11 +118,12 @@ struct ProfileRecord {
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
-  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+  ProfileRecord(PathAllocator &A, ChunkAllocator &CA,
+                const FunctionCallTrie::Node *N)
       : Path([&] {
           auto P =
               reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A);
+          new (P) PathArray(A, CA);
           return P;
         }()),
         Node(N) {}
@@ -135,17 +137,20 @@ using ProfileRecordArray = Array<Profile
 // the path(s) and the data associated with the path.
 static void populateRecords(ProfileRecordArray &PRs,
                             ProfileRecord::PathAllocator &PA,
-                            const FunctionCallTrie &Trie) {
+                            ChunkAllocator &CA, const FunctionCallTrie &Trie) {
   using StackArray = Array<const FunctionCallTrie::Node *>;
   using StackAllocator = typename StackArray::AllocatorType;
-  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max, 0);
-  StackArray DFSStack(StackAlloc);
+  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+  ChunkAllocator StackChunkAlloc(profilingFlags()->stack_allocator_max);
+  StackArray DFSStack(StackAlloc, StackChunkAlloc);
   for (const auto R : Trie.getRoots()) {
     DFSStack.Append(R);
     while (!DFSStack.empty()) {
       auto Node = DFSStack.back();
       DFSStack.trim(1);
-      auto Record = PRs.AppendEmplace(PA, Node);
+      auto Record = PRs.AppendEmplace(PA, CA, Node);
+      if (Record == nullptr)
+        return;
       DCHECK_NE(Record, nullptr);
 
       // Traverse the Node's parents and as we're doing so, get the FIds in
@@ -208,10 +213,11 @@ void serialize() {
   // Then repopulate the global ProfileBuffers.
   for (u32 I = 0; I < ThreadTries->Size(); ++I) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
-    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max, 0);
+    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+    ChunkAllocator CA(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
-        profilingFlags()->global_allocator_max, 0);
-    ProfileRecordArray ProfileRecords(PRAlloc);
+        profilingFlags()->global_allocator_max);
+    ProfileRecordArray ProfileRecords(PRAlloc, CA);
 
     // First, we want to compute the amount of space we're going to need. We'll
     // use a local allocator and an __xray::Array<...> to store the intermediary
@@ -220,7 +226,7 @@ void serialize() {
     const auto &Trie = *(*ThreadTries)[I].Trie;
     if (Trie.getRoots().empty())
       continue;
-    populateRecords(ProfileRecords, PathAlloc, Trie);
+    populateRecords(ProfileRecords, PathAlloc, CA, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
 

Modified: compiler-rt/trunk/lib/xray/xray_profiling.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_profiling.cc?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_profiling.cc (original)
+++ compiler-rt/trunk/lib/xray/xray_profiling.cc Tue Jul 17 18:53:39 2018
@@ -56,8 +56,9 @@ struct alignas(64) ProfilingData {
 
 static pthread_key_t ProfilingKey;
 
-ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
-  thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage;
+thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{};
+
+static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
   if (pthread_getspecific(ProfilingKey) == NULL) {
     new (&ThreadStorage) ProfilingData{};
     pthread_setspecific(ProfilingKey, &ThreadStorage);
@@ -86,6 +87,18 @@ ProfilingData &getThreadLocalData() XRAY
   return TLD;
 }
 
+static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
+  auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+  if (TLD.Allocators != nullptr && TLD.FCT != nullptr) {
+    TLD.FCT->~FunctionCallTrie();
+    TLD.Allocators->~Allocators();
+    InternalFree(TLD.FCT);
+    InternalFree(TLD.Allocators);
+    TLD.FCT = nullptr;
+    TLD.Allocators = nullptr;
+  }
+}
+
 } // namespace
 
 const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
@@ -148,6 +161,9 @@ XRayLogFlushStatus profilingFlush() XRAY
 
   profileCollectorService::reset();
 
+  // Flush the current thread's local data structures as well.
+  cleanupTLD();
+
   atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
                memory_order_release);
 
@@ -158,17 +174,12 @@ namespace {
 
 thread_local atomic_uint8_t ReentranceGuard{0};
 
-void postCurrentThreadFCT(ProfilingData &TLD) {
+static void postCurrentThreadFCT(ProfilingData &TLD) {
   if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
     return;
 
   profileCollectorService::post(*TLD.FCT, GetTid());
-  TLD.FCT->~FunctionCallTrie();
-  TLD.Allocators->~Allocators();
-  InternalFree(TLD.FCT);
-  InternalFree(TLD.Allocators);
-  TLD.FCT = nullptr;
-  TLD.Allocators = nullptr;
+  cleanupTLD();
 }
 
 } // namespace
@@ -294,10 +305,14 @@ profilingLoggingInit(size_t BufferSize,
     // ABI functions for registering exit handlers.
     Atexit(+[] {
       // Finalize and flush.
-      if (profilingFinalize() != XRAY_LOG_FINALIZED)
+      if (profilingFinalize() != XRAY_LOG_FINALIZED) {
+        cleanupTLD();
         return;
-      if (profilingFlush() != XRAY_LOG_FLUSHED)
+      }
+      if (profilingFlush() != XRAY_LOG_FLUSHED) {
+        cleanupTLD();
         return;
+      }
       if (Verbosity())
         Report("XRay Profile flushed at exit.");
     });

Modified: compiler-rt/trunk/lib/xray/xray_profiling_flags.inc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_profiling_flags.inc?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_profiling_flags.inc (original)
+++ compiler-rt/trunk/lib/xray/xray_profiling_flags.inc Tue Jul 17 18:53:39 2018
@@ -18,7 +18,7 @@ XRAY_FLAG(uptr, per_thread_allocator_max
           "Maximum size of any single per-thread allocator.")
 XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
           "Maximum size of the global allocator for profile storage.")
-XRAY_FLAG(uptr, stack_allocator_max, 2 << 24,
+XRAY_FLAG(uptr, stack_allocator_max, 2 << 20,
           "Maximum size of the traversal stack allocator.")
 XRAY_FLAG(int, grace_period_ms, 1,
           "Profile collection will wait this much time in milliseconds before "

Modified: compiler-rt/trunk/lib/xray/xray_segmented_array.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_segmented_array.h?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_segmented_array.h (original)
+++ compiler-rt/trunk/lib/xray/xray_segmented_array.h Tue Jul 17 18:53:39 2018
@@ -24,6 +24,13 @@
 #include <utility>
 
 namespace __xray {
+struct Chunk {
+  void *Data;
+  Chunk *Prev;
+  Chunk *Next;
+};
+
+using ChunkAllocator = Allocator<next_pow2(sizeof(Chunk)) * 8>;
 
 /// The Array type provides an interface similar to std::vector<...> but does
 /// not shrink in size. Once constructed, elements can be appended but cannot be
@@ -51,16 +58,10 @@ private:
   // TODO: Consider co-locating the chunk information with the data in the
   // Block, as in an intrusive list -- i.e. putting the next and previous
   // pointer values inside the Block storage.
-  struct Chunk {
-    typename AllocatorType::Block Block;
-    static constexpr size_t Size = N;
-    Chunk *Prev = this;
-    Chunk *Next = this;
-  };
-
   static Chunk SentinelChunk;
 
   AllocatorType *Alloc;
+  ChunkAllocator *ChunkAlloc;
   Chunk *Head = &SentinelChunk;
   Chunk *Tail = &SentinelChunk;
   size_t Size = 0;
@@ -82,30 +83,19 @@ private:
       return FreeChunk;
     }
 
-    auto Block = Alloc->Allocate(ElementStorageSize);
+    auto Block = Alloc->Allocate();
     if (Block.Data == nullptr)
       return nullptr;
 
-    // TODO: Maybe use a separate managed allocator for Chunk instances?
-    auto C = reinterpret_cast<Chunk *>(
-        InternalAlloc(sizeof(Chunk), nullptr, kCacheLineSize));
-    if (C == nullptr)
+    auto ChunkElement = ChunkAlloc->Allocate();
+    if (ChunkElement.Data == nullptr)
       return nullptr;
-    C->Block = Block;
-    C->Prev = &SentinelChunk;
-    C->Next = &SentinelChunk;
-    return C;
-  }
 
-  static AllocatorType &GetGlobalAllocator() {
-    static AllocatorType *const GlobalAllocator = [] {
-      AllocatorType *A = reinterpret_cast<AllocatorType *>(
-          InternalAlloc(sizeof(AllocatorType)));
-      new (A) AllocatorType(2 << 10, 0);
-      return A;
-    }();
-
-    return *GlobalAllocator;
+    // Placement-new the Chunk element at the appropriate location.
+    auto C = reinterpret_cast<Chunk *>(ChunkElement.Data);
+    Chunk LocalC{Block.Data, &SentinelChunk, &SentinelChunk};
+    internal_memcpy(C, &LocalC, sizeof(LocalC));
+    return C;
   }
 
   Chunk *InitHeadAndTail() {
@@ -201,21 +191,21 @@ private:
     U &operator*() const {
       DCHECK_NE(C, &SentinelChunk);
       auto RelOff = Offset % N;
-      return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+      return *reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
                                     (RelOff * ElementStorageSize));
     }
 
     U *operator->() const {
       DCHECK_NE(C, &SentinelChunk);
       auto RelOff = Offset % N;
-      return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Block.Data) +
+      return reinterpret_cast<U *>(reinterpret_cast<char *>(C->Data) +
                                    (RelOff * ElementStorageSize));
     }
   };
 
 public:
-  explicit Array(AllocatorType &A) : Alloc(&A) {}
-  Array() : Array(GetGlobalAllocator()) {}
+  explicit Array(AllocatorType &A, ChunkAllocator &CA)
+      : Alloc(&A), ChunkAlloc(&CA) {}
 
   Array(const Array &) = delete;
   Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
@@ -246,9 +236,8 @@ public:
       if (AppendNewChunk() == nullptr)
         return nullptr;
 
-    auto Position =
-        reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Block.Data) +
-                              (Offset * ElementStorageSize));
+    auto Position = reinterpret_cast<T *>(reinterpret_cast<char *>(Tail->Data) +
+                                          (Offset * ElementStorageSize));
     *Position = E;
     ++Size;
     return Position;
@@ -268,7 +257,7 @@ public:
     }
 
     DCHECK_NE(Tail, &SentinelChunk);
-    auto Position = reinterpret_cast<char *>(LatestChunk->Block.Data) +
+    auto Position = reinterpret_cast<char *>(LatestChunk->Data) +
                     (Offset * ElementStorageSize);
     DCHECK_EQ(reinterpret_cast<uintptr_t>(Position) % ElementStorageSize, 0);
     // In-place construct at Position.
@@ -286,7 +275,7 @@ public:
       Offset -= N;
       DCHECK_NE(C, &SentinelChunk);
     }
-    return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Block.Data) +
+    return *reinterpret_cast<T *>(reinterpret_cast<char *>(C->Data) +
                                   (Offset * ElementStorageSize));
   }
 
@@ -360,7 +349,8 @@ public:
 // ensure that storage for the SentinelChunk is defined and has a single
 // address.
 template <class T, size_t N>
-typename Array<T, N>::Chunk Array<T, N>::SentinelChunk;
+Chunk Array<T, N>::SentinelChunk{nullptr, &Array<T, N>::SentinelChunk,
+                                 &Array<T, N>::SentinelChunk};
 
 } // namespace __xray
 

Modified: compiler-rt/trunk/lib/xray/xray_utils.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_utils.h?rev=337342&r1=337341&r2=337342&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_utils.h (original)
+++ compiler-rt/trunk/lib/xray/xray_utils.h Tue Jul 17 18:53:39 2018
@@ -15,6 +15,8 @@
 #ifndef XRAY_UTILS_H
 #define XRAY_UTILS_H
 
+#include <cstddef>
+#include <cstdint>
 #include <sys/types.h>
 #include <utility>
 
@@ -54,6 +56,14 @@ constexpr size_t next_pow2(size_t number
   return next_pow2_helper(number, 1);
 }
 
+template <class T> constexpr T &max(T &A, T &B) { return A > B ? A : B; }
+
+template <class T> constexpr T &min(T &A, T &B) { return A <= B ? A : B; }
+
+constexpr ptrdiff_t diff(uintptr_t A, uintptr_t B) {
+  return max(A, B) - min(A, B);
+}
+
 } // namespace __xray
 
 #endif // XRAY_UTILS_H



