[compiler-rt] r339978 - [XRay][compiler-rt] Avoid InternalAlloc(...) in Profiling Mode

Dean Michael Berris via llvm-commits <llvm-commits at lists.llvm.org>
Thu Aug 16 18:57:42 PDT 2018


Author: dberris
Date: Thu Aug 16 18:57:42 2018
New Revision: 339978

URL: http://llvm.org/viewvc/llvm-project?rev=339978&view=rev
Log:
[XRay][compiler-rt] Avoid InternalAlloc(...) in Profiling Mode

Summary:
We avoid using dynamic memory allocated with the internal allocator
(InternalAlloc) in the profile collection service used by profiling mode.
Instead, we use aligned storage for globals, and aligned storage embedded
in structs for objects that we initialize dynamically with placement new.
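
For illustration, the pattern looks roughly like the minimal sketch below
(all names here -- Widget, WidgetStorage, initWidget, resetWidget -- are
hypothetical stand-ins, not part of the XRay sources): reserve suitably
aligned storage at namespace scope so no constructor runs at start-up,
then construct and destroy the object in that storage explicitly.

  #include <new>
  #include <type_traits>

  // Hypothetical stand-in for a type like FunctionCallTrie::Allocators.
  struct Widget {
    int Value;
    explicit Widget(int V) : Value(V) {}
  };

  // Global aligned storage reserves correctly aligned space without running
  // any constructor during program start-up (no dynamic initialization).
  static typename std::aligned_storage<sizeof(Widget), alignof(Widget)>::type
      WidgetStorage;
  static Widget *GlobalWidget = nullptr;

  static void initWidget() {
    // Construct the object into the pre-reserved storage with placement new.
    GlobalWidget = reinterpret_cast<Widget *>(&WidgetStorage);
    new (GlobalWidget) Widget(42);
  }

  static void resetWidget() {
    // Destroy explicitly; the storage itself is never freed.
    if (GlobalWidget != nullptr) {
      GlobalWidget->~Widget();
      GlobalWidget = nullptr;
    }
  }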

We also remove the dependency on `Vector<...>`, which itself uses the
dynamic allocator in sanitizer_common (InternalAlloc), in favour of the
XRay allocator and the segmented array implementation.
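
As a rough sketch of the replacement pattern (based on the usage visible in
the diff below; the Record type, its fields, and useRecords are hypothetical),
an `__xray::Array<T>` is paired with its own `AllocatorType`, which is
constructed with a byte budget:

  #include "xray_segmented_array.h"

  #include <cstddef>

  namespace __xray {

  // Hypothetical element type; the real code stores ThreadTrie and
  // ProfileBuffer elements.
  struct Record {
    int FId;
    int Count;
  };

  using RecordArray = Array<Record>;
  using RecordArrayAllocator = typename RecordArray::AllocatorType;

  void useRecords(size_t MaxBytes) {
    // The allocator is constructed with a byte budget; the array borrows it.
    RecordArrayAllocator Alloc(MaxBytes);
    RecordArray Records(Alloc);

    // Append a default-constructed element and fill it in place.
    auto *R = Records.Append({});
    R->FId = 1;
    R->Count = 10;

    // Range-based iteration, then clear by trimming all elements; segments
    // that were already allocated stay behind for reuse.
    for (const auto &Rec : Records)
      (void)Rec;
    Records.trim(Records.size());
  }

  } // namespace __xray

Unlike `Vector<...>`, none of this goes through InternalAlloc, and trimming
leaves the underlying segments allocated for later reuse.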

This change addresses llvm.org/PR38577.

Reviewers: eizan

Reviewed By: eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D50782

Modified:
    compiler-rt/trunk/lib/xray/xray_profile_collector.cc
    compiler-rt/trunk/lib/xray/xray_segmented_array.h

Modified: compiler-rt/trunk/lib/xray/xray_profile_collector.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_profile_collector.cc?rev=339978&r1=339977&r2=339978&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_profile_collector.cc (original)
+++ compiler-rt/trunk/lib/xray/xray_profile_collector.cc Thu Aug 16 18:57:42 2018
@@ -13,10 +13,10 @@
 //
 //===----------------------------------------------------------------------===//
 #include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_allocator.h"
 #include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
 #include <memory>
 #include <pthread.h>
 #include <utility>
@@ -29,7 +29,7 @@ namespace {
 SpinMutex GlobalMutex;
 struct ThreadTrie {
   tid_t TId;
-  FunctionCallTrie *Trie;
+  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
 };
 
 struct ProfileBuffer {
@@ -56,65 +56,68 @@ struct BlockHeader {
   u64 ThreadId;
 };
 
-// These need to be pointers that point to heap/internal-allocator-allocated
-// objects because these are accessed even at program exit.
-Vector<ThreadTrie> *ThreadTries = nullptr;
-Vector<ProfileBuffer> *ProfileBuffers = nullptr;
-FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+using ThreadTriesArray = Array<ThreadTrie>;
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
+    AllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
+    ThreadTriesStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+    ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
+    ThreadTriesArrayAllocatorStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+    ProfileBufferArrayAllocatorStorage;
+
+static ThreadTriesArray *ThreadTries = nullptr;
+static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+static void *allocateBuffer(size_t S) {
+  auto B = reinterpret_cast<void *>(internal_mmap(
+      NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+  if (B == MAP_FAILED) {
+    if (Verbosity())
+      Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
+    return nullptr;
+  }
+  return B;
+}
+
+static void deallocateBuffer(void *B, size_t S) {
+  if (B == nullptr)
+    return;
+  internal_munmap(B, S);
+}
 
 } // namespace
 
 void post(const FunctionCallTrie &T, tid_t TId) {
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  pthread_once(&Once, +[] {
-    SpinMutexLock Lock(&GlobalMutex);
-    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
-    new (GlobalAllocators) FunctionCallTrie::Allocators();
-    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
-        profilingFlags()->global_allocator_max);
-    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-        InternalAlloc(sizeof(Vector<ThreadTrie>)));
-    new (ThreadTries) Vector<ThreadTrie>();
-    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-    new (ProfileBuffers) Vector<ProfileBuffer>();
-  });
-  DCHECK_NE(GlobalAllocators, nullptr);
-  DCHECK_NE(ThreadTries, nullptr);
-  DCHECK_NE(ProfileBuffers, nullptr);
+  pthread_once(&Once, +[] { reset(); });
 
   ThreadTrie *Item = nullptr;
   {
     SpinMutexLock Lock(&GlobalMutex);
-    if (GlobalAllocators == nullptr)
+    if (GlobalAllocators == nullptr || ThreadTries == nullptr)
       return;
 
-    Item = ThreadTries->PushBack();
+    Item = ThreadTries->Append({});
     Item->TId = TId;
-
-    // Here we're using the internal allocator instead of the managed allocator
-    // because:
-    //
-    // 1) We're not using the segmented array data structure to host
-    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
-    //    which works like a std::vector<...> keeping elements contiguous in
-    //    memory. The segmented array data structure assumes that elements are
-    //    trivially destructible, where FunctionCallTrie isn't.
-    //
-    // 2) Using a managed allocator means we need to manage that separately,
-    //    which complicates the nature of this code. To get around that, we're
-    //    using the internal allocator instead, which has its own global state
-    //    and is decoupled from the lifetime management required by the managed
-    //    allocator we have in XRay.
-    //
-    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
-        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
-    DCHECK_NE(Item->Trie, nullptr);
-    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+    auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+    new (Trie) FunctionCallTrie(*GlobalAllocators);
   }
 
-  T.deepCopyInto(*Item->Trie);
+  auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+  T.deepCopyInto(*Trie);
 }
 
 // A PathArray represents the function id's representing a stack trace. In this
@@ -127,18 +130,12 @@ struct ProfileRecord {
 
   // The Path in this record is the function id's from the leaf to the root of
   // the function call stack as represented from a FunctionCallTrie.
-  PathArray *Path = nullptr;
+  PathArray Path;
   const FunctionCallTrie::Node *Node = nullptr;
 
   // Constructor for in-place construction.
   ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
-      : Path([&] {
-          auto P =
-              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
-          new (P) PathArray(A);
-          return P;
-        }()),
-        Node(N) {}
+      : Path(A), Node(N) {}
 };
 
 namespace {
@@ -167,8 +164,8 @@ static void populateRecords(ProfileRecor
       // Traverse the Node's parents and as we're doing so, get the FIds in
       // the order they appear.
       for (auto N = Node; N != nullptr; N = N->Parent)
-        Record->Path->Append(N->FId);
-      DCHECK(!Record->Path->empty());
+        Record->Path.Append(N->FId);
+      DCHECK(!Record->Path.empty());
 
       for (const auto C : Node->Callees)
         DFSStack.Append(C.NodePtr);
@@ -183,7 +180,7 @@ static void serializeRecords(ProfileBuff
                  sizeof(Header);
   for (const auto &Record : ProfileRecords) {
     // List of IDs follow:
-    for (const auto FId : *Record.Path)
+    for (const auto FId : Record.Path)
       NextPtr =
           static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
           sizeof(FId);
@@ -213,16 +210,21 @@ static void serializeRecords(ProfileBuff
 void serialize() {
   SpinMutexLock Lock(&GlobalMutex);
 
-  // Clear out the global ProfileBuffers.
-  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-    InternalFree((*ProfileBuffers)[I].Data);
-  ProfileBuffers->Reset();
+  if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
+      ProfileBuffers == nullptr)
+    return;
+
+  // Clear out the global ProfileBuffers, if it's not empty.
+  for (auto &B : *ProfileBuffers)
+    deallocateBuffer(B.Data, B.Size);
+  ProfileBuffers->trim(ProfileBuffers->size());
 
-  if (ThreadTries->Size() == 0)
+  if (ThreadTries->empty())
     return;
 
   // Then repopulate the global ProfileBuffers.
-  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+  u32 I = 0;
+  for (const auto &ThreadTrie : *ThreadTries) {
     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
     ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
     ProfileRecord::PathAllocator PathAlloc(
@@ -233,9 +235,11 @@ void serialize() {
     // use a local allocator and an __xray::Array<...> to store the intermediary
     // data, then compute the size as we're going along. Then we'll allocate the
     // contiguous space to contain the thread buffer data.
-    const auto &Trie = *(*ThreadTries)[I].Trie;
+    const auto &Trie =
+        *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
     if (Trie.getRoots().empty())
       continue;
+
     populateRecords(ProfileRecords, PathAlloc, Trie);
     DCHECK(!Trie.getRoots().empty());
     DCHECK(!ProfileRecords.empty());
@@ -251,68 +255,71 @@ void serialize() {
     //   + end of record (8 bytes)
     u32 CumulativeSizes = 0;
     for (const auto &Record : ProfileRecords)
-      CumulativeSizes += 20 + (4 * Record.Path->size());
+      CumulativeSizes += 20 + (4 * Record.Path.size());
 
-    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
-    auto Buffer = ProfileBuffers->PushBack();
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto Buffer = ProfileBuffers->Append({});
     Buffer->Size = sizeof(Header) + CumulativeSizes;
-    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    Buffer->Data = allocateBuffer(Buffer->Size);
     DCHECK_NE(Buffer->Data, nullptr);
     serializeRecords(Buffer, Header, ProfileRecords);
-
-    // Now clean up the ProfileRecords array, one at a time.
-    for (auto &Record : ProfileRecords) {
-      Record.Path->~PathArray();
-      InternalFree(Record.Path);
-    }
   }
 }
 
 void reset() {
   SpinMutexLock Lock(&GlobalMutex);
+
   if (ProfileBuffers != nullptr) {
     // Clear out the profile buffers that have been serialized.
-    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
-      InternalFree((*ProfileBuffers)[I].Data);
-    ProfileBuffers->Reset();
-    InternalFree(ProfileBuffers);
-    ProfileBuffers = nullptr;
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(B.Data, B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
   }
 
   if (ThreadTries != nullptr) {
     // Clear out the function call tries per thread.
-    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
-      auto &T = (*ThreadTries)[I];
-      T.Trie->~FunctionCallTrie();
-      InternalFree(T.Trie);
+    for (auto &T : *ThreadTries) {
+      auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
+      Trie->~FunctionCallTrie();
     }
-    ThreadTries->Reset();
-    InternalFree(ThreadTries);
-    ThreadTries = nullptr;
+    ThreadTries->trim(ThreadTries->size());
   }
 
   // Reset the global allocators.
-  if (GlobalAllocators != nullptr) {
+  if (GlobalAllocators != nullptr)
     GlobalAllocators->~Allocators();
-    InternalFree(GlobalAllocators);
-    GlobalAllocators = nullptr;
-  }
-  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
-      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+
+  GlobalAllocators =
+      reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
   new (GlobalAllocators) FunctionCallTrie::Allocators();
   *GlobalAllocators = FunctionCallTrie::InitAllocators();
-  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
-      InternalAlloc(sizeof(Vector<ThreadTrie>)));
-  new (ThreadTries) Vector<ThreadTrie>();
-  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
-      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
-  new (ProfileBuffers) Vector<ProfileBuffer>();
+
+  if (ThreadTriesAllocator != nullptr)
+    ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+
+  ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
+      &ThreadTriesArrayAllocatorStorage);
+  new (ThreadTriesAllocator)
+      ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
+  ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
+  new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+
+  if (ProfileBuffersAllocator != nullptr)
+    ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+
+  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+      &ProfileBufferArrayAllocatorStorage);
+  new (ProfileBuffersAllocator)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+  ProfileBuffers =
+      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+  new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
 }
 
 XRayBuffer nextBuffer(XRayBuffer B) {
   SpinMutexLock Lock(&GlobalMutex);
 
-  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
     return {nullptr, 0};
 
   static pthread_once_t Once = PTHREAD_ONCE_INIT;
@@ -336,7 +343,7 @@ XRayBuffer nextBuffer(XRayBuffer B) {
   BlockHeader Header;
   internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
   auto NextBlock = Header.BlockNum + 1;
-  if (NextBlock < ProfileBuffers->Size())
+  if (NextBlock < ProfileBuffers->size())
     return {(*ProfileBuffers)[NextBlock].Data,
             (*ProfileBuffers)[NextBlock].Size};
   return {nullptr, 0};

Modified: compiler-rt/trunk/lib/xray/xray_segmented_array.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_segmented_array.h?rev=339978&r1=339977&r2=339978&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_segmented_array.h (original)
+++ compiler-rt/trunk/lib/xray/xray_segmented_array.h Thu Aug 16 18:57:42 2018
@@ -325,6 +325,9 @@ public:
   /// Remove N Elements from the end. This leaves the blocks behind, and not
   /// require allocation of new blocks for new elements added after trimming.
   void trim(size_t Elements) {
+    if (Elements == 0)
+      return;
+
     DCHECK_LE(Elements, Size);
     DCHECK_GT(Size, 0);
     auto OldSize = Size;



