[compiler-rt] r339978 - [XRay][compiler-rt] Avoid InternalAlloc(...) in Profiling Mode
Dean Michael Berris via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 16 18:57:42 PDT 2018
Author: dberris
Date: Thu Aug 16 18:57:42 2018
New Revision: 339978
URL: http://llvm.org/viewvc/llvm-project?rev=339978&view=rev
Log:
[XRay][compiler-rt] Avoid InternalAlloc(...) in Profiling Mode
Summary:
We avoid using dynamic memory allocated with the internal allocator in
the profile collection service used by profiling mode. Instead, we use
aligned storage for globals and in-struct aligned storage for objects
we initialize dynamically with placement new.
We also remove the dependency on `Vector<...>`, which internally uses
the dynamic allocator in sanitizer_common (InternalAlloc), in favour of
the XRay allocator and segmented array implementation.
This change addresses llvm.org/PR38577.
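To illustrate the pattern, here is a minimal sketch of avoiding dynamic
initialization with global aligned storage and placement new. The `Widget`
type is a hypothetical stand-in for a non-trivially-constructible type such
as `FunctionCallTrie::Allocators`; this is not the actual XRay code.

#include <new>
#include <type_traits>

struct Widget {
  explicit Widget(int Max) : Max(Max) {}
  int Max;
};

// Global aligned storage is zero-initialized at load time, so it has no
// dynamic initializer and costs no heap allocation.
static std::aligned_storage<sizeof(Widget), alignof(Widget)>::type
    WidgetStorage;
static Widget *GlobalWidget = nullptr;

static void initWidget(int Max) {
  // Construct the object in the pre-reserved storage with placement new;
  // the storage is suitably aligned, so the resulting pointer is too.
  GlobalWidget = reinterpret_cast<Widget *>(&WidgetStorage);
  new (GlobalWidget) Widget(Max);
}

static void destroyWidget() {
  if (GlobalWidget != nullptr) {
    GlobalWidget->~Widget(); // Destroy explicitly; the storage remains.
    GlobalWidget = nullptr;
  }
}

This mirrors how reset() in the diff below placement-news the allocators and
arrays into the *Storage globals instead of calling InternalAlloc.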
Reviewers: eizan
Reviewed By: eizan
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D50782
Modified:
compiler-rt/trunk/lib/xray/xray_profile_collector.cc
compiler-rt/trunk/lib/xray/xray_segmented_array.h
Modified: compiler-rt/trunk/lib/xray/xray_profile_collector.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_profile_collector.cc?rev=339978&r1=339977&r2=339978&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_profile_collector.cc (original)
+++ compiler-rt/trunk/lib/xray/xray_profile_collector.cc Thu Aug 16 18:57:42 2018
@@ -13,10 +13,10 @@
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_allocator.h"
#include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>
@@ -29,7 +29,7 @@ namespace {
SpinMutex GlobalMutex;
struct ThreadTrie {
tid_t TId;
- FunctionCallTrie *Trie;
+ typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};
struct ProfileBuffer {
@@ -56,65 +56,68 @@ struct BlockHeader {
u64 ThreadId;
};
-// These need to be pointers that point to heap/internal-allocator-allocated
-// objects because these are accessed even at program exit.
-Vector<ThreadTrie> *ThreadTries = nullptr;
-Vector<ProfileBuffer> *ProfileBuffers = nullptr;
-FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+using ThreadTriesArray = Array<ThreadTrie>;
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ThreadTriesArrayAllocator = typename ThreadTriesArray::AllocatorType;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(FunctionCallTrie::Allocators)>::type
+ AllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArray)>::type
+ ThreadTriesStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+ ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ThreadTriesArrayAllocator)>::type
+ ThreadTriesArrayAllocatorStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+ ProfileBufferArrayAllocatorStorage;
+
+static ThreadTriesArray *ThreadTries = nullptr;
+static ThreadTriesArrayAllocator *ThreadTriesAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+static void *allocateBuffer(size_t S) {
+ auto B = reinterpret_cast<void *>(internal_mmap(
+ NULL, S, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+ if (B == MAP_FAILED) {
+ if (Verbosity())
+ Report("XRay Profiling: Failed to allocate memory of size %d.\n", S);
+ return nullptr;
+ }
+ return B;
+}
+
+static void deallocateBuffer(void *B, size_t S) {
+ if (B == nullptr)
+ return;
+ internal_munmap(B, S);
+}
} // namespace
void post(const FunctionCallTrie &T, tid_t TId) {
static pthread_once_t Once = PTHREAD_ONCE_INIT;
- pthread_once(&Once, +[] {
- SpinMutexLock Lock(&GlobalMutex);
- GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
- InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
- new (GlobalAllocators) FunctionCallTrie::Allocators();
- *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
- profilingFlags()->global_allocator_max);
- ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
- InternalAlloc(sizeof(Vector<ThreadTrie>)));
- new (ThreadTries) Vector<ThreadTrie>();
- ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
- InternalAlloc(sizeof(Vector<ProfileBuffer>)));
- new (ProfileBuffers) Vector<ProfileBuffer>();
- });
- DCHECK_NE(GlobalAllocators, nullptr);
- DCHECK_NE(ThreadTries, nullptr);
- DCHECK_NE(ProfileBuffers, nullptr);
+ pthread_once(&Once, +[] { reset(); });
ThreadTrie *Item = nullptr;
{
SpinMutexLock Lock(&GlobalMutex);
- if (GlobalAllocators == nullptr)
+ if (GlobalAllocators == nullptr || ThreadTries == nullptr)
return;
- Item = ThreadTries->PushBack();
+ Item = ThreadTries->Append({});
Item->TId = TId;
-
- // Here we're using the internal allocator instead of the managed allocator
- // because:
- //
- // 1) We're not using the segmented array data structure to host
- // FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
- // which works like a std::vector<...> keeping elements contiguous in
- // memory. The segmented array data structure assumes that elements are
- // trivially destructible, where FunctionCallTrie isn't.
- //
- // 2) Using a managed allocator means we need to manage that separately,
- // which complicates the nature of this code. To get around that, we're
- // using the internal allocator instead, which has its own global state
- // and is decoupled from the lifetime management required by the managed
- // allocator we have in XRay.
- //
- Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
- sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
- DCHECK_NE(Item->Trie, nullptr);
- new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+ auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+ new (Trie) FunctionCallTrie(*GlobalAllocators);
}
- T.deepCopyInto(*Item->Trie);
+ auto Trie = reinterpret_cast<FunctionCallTrie *>(&Item->TrieStorage);
+ T.deepCopyInto(*Trie);
}
// A PathArray represents the function id's representing a stack trace. In this
@@ -127,18 +130,12 @@ struct ProfileRecord {
// The Path in this record is the function id's from the leaf to the root of
// the function call stack as represented from a FunctionCallTrie.
- PathArray *Path = nullptr;
+ PathArray Path;
const FunctionCallTrie::Node *Node = nullptr;
// Constructor for in-place construction.
ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
- : Path([&] {
- auto P =
- reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
- new (P) PathArray(A);
- return P;
- }()),
- Node(N) {}
+ : Path(A), Node(N) {}
};
namespace {
@@ -167,8 +164,8 @@ static void populateRecords(ProfileRecor
// Traverse the Node's parents and as we're doing so, get the FIds in
// the order they appear.
for (auto N = Node; N != nullptr; N = N->Parent)
- Record->Path->Append(N->FId);
- DCHECK(!Record->Path->empty());
+ Record->Path.Append(N->FId);
+ DCHECK(!Record->Path.empty());
for (const auto C : Node->Callees)
DFSStack.Append(C.NodePtr);
@@ -183,7 +180,7 @@ static void serializeRecords(ProfileBuff
sizeof(Header);
for (const auto &Record : ProfileRecords) {
// List of IDs follow:
- for (const auto FId : *Record.Path)
+ for (const auto FId : Record.Path)
NextPtr =
static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
sizeof(FId);
@@ -213,16 +210,21 @@ static void serializeRecords(ProfileBuff
void serialize() {
SpinMutexLock Lock(&GlobalMutex);
- // Clear out the global ProfileBuffers.
- for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
- InternalFree((*ProfileBuffers)[I].Data);
- ProfileBuffers->Reset();
+ if (GlobalAllocators == nullptr || ThreadTries == nullptr ||
+ ProfileBuffers == nullptr)
+ return;
+
+ // Clear out the global ProfileBuffers, if it's not empty.
+ for (auto &B : *ProfileBuffers)
+ deallocateBuffer(B.Data, B.Size);
+ ProfileBuffers->trim(ProfileBuffers->size());
- if (ThreadTries->Size() == 0)
+ if (ThreadTries->empty())
return;
// Then repopulate the global ProfileBuffers.
- for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+ u32 I = 0;
+ for (const auto &ThreadTrie : *ThreadTries) {
using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
ProfileRecord::PathAllocator PathAlloc(
@@ -233,9 +235,11 @@ void serialize() {
// use a local allocator and an __xray::Array<...> to store the intermediary
// data, then compute the size as we're going along. Then we'll allocate the
// contiguous space to contain the thread buffer data.
- const auto &Trie = *(*ThreadTries)[I].Trie;
+ const auto &Trie =
+ *reinterpret_cast<const FunctionCallTrie *>(&(ThreadTrie.TrieStorage));
if (Trie.getRoots().empty())
continue;
+
populateRecords(ProfileRecords, PathAlloc, Trie);
DCHECK(!Trie.getRoots().empty());
DCHECK(!ProfileRecords.empty());
@@ -251,68 +255,71 @@ void serialize() {
// + end of record (8 bytes)
u32 CumulativeSizes = 0;
for (const auto &Record : ProfileRecords)
- CumulativeSizes += 20 + (4 * Record.Path->size());
+ CumulativeSizes += 20 + (4 * Record.Path.size());
- BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
- auto Buffer = ProfileBuffers->PushBack();
+ BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+ auto Buffer = ProfileBuffers->Append({});
Buffer->Size = sizeof(Header) + CumulativeSizes;
- Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+ Buffer->Data = allocateBuffer(Buffer->Size);
DCHECK_NE(Buffer->Data, nullptr);
serializeRecords(Buffer, Header, ProfileRecords);
-
- // Now clean up the ProfileRecords array, one at a time.
- for (auto &Record : ProfileRecords) {
- Record.Path->~PathArray();
- InternalFree(Record.Path);
- }
}
}
void reset() {
SpinMutexLock Lock(&GlobalMutex);
+
if (ProfileBuffers != nullptr) {
// Clear out the profile buffers that have been serialized.
- for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
- InternalFree((*ProfileBuffers)[I].Data);
- ProfileBuffers->Reset();
- InternalFree(ProfileBuffers);
- ProfileBuffers = nullptr;
+ for (auto &B : *ProfileBuffers)
+ deallocateBuffer(B.Data, B.Size);
+ ProfileBuffers->trim(ProfileBuffers->size());
}
if (ThreadTries != nullptr) {
// Clear out the function call tries per thread.
- for (uptr I = 0; I < ThreadTries->Size(); ++I) {
- auto &T = (*ThreadTries)[I];
- T.Trie->~FunctionCallTrie();
- InternalFree(T.Trie);
+ for (auto &T : *ThreadTries) {
+ auto Trie = reinterpret_cast<FunctionCallTrie *>(&T.TrieStorage);
+ Trie->~FunctionCallTrie();
}
- ThreadTries->Reset();
- InternalFree(ThreadTries);
- ThreadTries = nullptr;
+ ThreadTries->trim(ThreadTries->size());
}
// Reset the global allocators.
- if (GlobalAllocators != nullptr) {
+ if (GlobalAllocators != nullptr)
GlobalAllocators->~Allocators();
- InternalFree(GlobalAllocators);
- GlobalAllocators = nullptr;
- }
- GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
- InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+
+ GlobalAllocators =
+ reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorStorage);
new (GlobalAllocators) FunctionCallTrie::Allocators();
*GlobalAllocators = FunctionCallTrie::InitAllocators();
- ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
- InternalAlloc(sizeof(Vector<ThreadTrie>)));
- new (ThreadTries) Vector<ThreadTrie>();
- ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
- InternalAlloc(sizeof(Vector<ProfileBuffer>)));
- new (ProfileBuffers) Vector<ProfileBuffer>();
+
+ if (ThreadTriesAllocator != nullptr)
+ ThreadTriesAllocator->~ThreadTriesArrayAllocator();
+
+ ThreadTriesAllocator = reinterpret_cast<ThreadTriesArrayAllocator *>(
+ &ThreadTriesArrayAllocatorStorage);
+ new (ThreadTriesAllocator)
+ ThreadTriesArrayAllocator(profilingFlags()->global_allocator_max);
+ ThreadTries = reinterpret_cast<ThreadTriesArray *>(&ThreadTriesStorage);
+ new (ThreadTries) ThreadTriesArray(*ThreadTriesAllocator);
+
+ if (ProfileBuffersAllocator != nullptr)
+ ProfileBuffersAllocator->~ProfileBufferArrayAllocator();
+
+ ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+ &ProfileBufferArrayAllocatorStorage);
+ new (ProfileBuffersAllocator)
+ ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+ ProfileBuffers =
+ reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+ new (ProfileBuffers) ProfileBufferArray(*ProfileBuffersAllocator);
}
XRayBuffer nextBuffer(XRayBuffer B) {
SpinMutexLock Lock(&GlobalMutex);
- if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+ if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
return {nullptr, 0};
static pthread_once_t Once = PTHREAD_ONCE_INIT;
@@ -336,7 +343,7 @@ XRayBuffer nextBuffer(XRayBuffer B) {
BlockHeader Header;
internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
auto NextBlock = Header.BlockNum + 1;
- if (NextBlock < ProfileBuffers->Size())
+ if (NextBlock < ProfileBuffers->size())
return {(*ProfileBuffers)[NextBlock].Data,
(*ProfileBuffers)[NextBlock].Size};
return {nullptr, 0};
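As an aside, the allocateBuffer/deallocateBuffer pair introduced above can be
sketched outside compiler-rt by substituting plain mmap/munmap and fprintf
for the sanitizer-internal wrappers (internal_mmap, internal_munmap, Report).
The names and logic mirror the patch, but this is an approximation, not the
committed code.

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

static void *allocateBuffer(size_t S) {
  // Anonymous private mapping: zero-filled, page-granular, and independent
  // of any allocator state, which is what makes it usable even at program
  // exit.
  void *B = mmap(nullptr, S, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (B == MAP_FAILED) {
    fprintf(stderr,
            "XRay Profiling: Failed to allocate memory of size %zu.\n", S);
    return nullptr;
  }
  return B;
}

static void deallocateBuffer(void *B, size_t S) {
  if (B == nullptr)
    return;
  // munmap needs the mapping's size, which is why ProfileBuffer carries
  // both Data and Size.
  munmap(B, S);
}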
Modified: compiler-rt/trunk/lib/xray/xray_segmented_array.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/xray/xray_segmented_array.h?rev=339978&r1=339977&r2=339978&view=diff
==============================================================================
--- compiler-rt/trunk/lib/xray/xray_segmented_array.h (original)
+++ compiler-rt/trunk/lib/xray/xray_segmented_array.h Thu Aug 16 18:57:42 2018
@@ -325,6 +325,9 @@ public:
/// Remove N Elements from the end. This leaves the blocks behind, and does
/// not require allocation of new blocks for new elements added after trimming.
void trim(size_t Elements) {
+ if (Elements == 0)
+ return;
+
DCHECK_LE(Elements, Size);
DCHECK_GT(Size, 0);
auto OldSize = Size;
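To make the new guard concrete, here is a simplified model of trim() built on
std::vector; MiniArray is a stand-in for __xray::Array and does not model its
block-based storage. Callers in the collector clear containers via
trim(size()), which is trim(0) on an empty array and would otherwise trip the
Size > 0 assertion.

#include <cassert>
#include <cstddef>
#include <vector>

template <typename T> class MiniArray {
  std::vector<T> Data; // Stand-in for the segmented, block-based storage.

public:
  void Append(const T &V) { Data.push_back(V); }
  std::size_t size() const { return Data.size(); }

  // Remove Elements items from the end; a no-op when Elements == 0,
  // matching the guard added in the diff above.
  void trim(std::size_t Elements) {
    if (Elements == 0)
      return;
    assert(Elements <= Data.size() && "cannot trim more than size()");
    Data.resize(Data.size() - Elements); // The real code keeps blocks around.
  }
};

int main() {
  MiniArray<int> A;
  A.trim(A.size()); // Empty array: the zero guard makes this a no-op.
  A.Append(1);
  A.Append(2);
  A.trim(A.size()); // Clears both elements.
  return A.size() == 0 ? 0 : 1;
}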