[compiler-rt] ccf765c - [compiler-rt][ctx_profile] Add the instrumented contextual profiling APIs (#89838)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 7 15:01:20 PDT 2024
Author: Mircea Trofin
Date: 2024-05-07T15:01:15-07:00
New Revision: ccf765cfd578c4ea4f710386e19cb8d1ef1859ce
URL: https://github.com/llvm/llvm-project/commit/ccf765cfd578c4ea4f710386e19cb8d1ef1859ce
DIFF: https://github.com/llvm/llvm-project/commit/ccf765cfd578c4ea4f710386e19cb8d1ef1859ce.diff
LOG: [compiler-rt][ctx_profile] Add the instrumented contextual profiling APIs (#89838)
APIs for contextual profiling. `ContextNode` is the call-context-specific counter buffer. `ContextRoot` is associated with those functions that constitute roots into interesting call graphs; it is the object off which we hang the `Arena`s used for allocating `ContextNode`s, as well as the `ContextNode` corresponding to such functions. Graphs of `ContextNode`s are accessible by one thread at a time.
(Tracking Issue: #89287, more details in the RFC referenced there)
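For orientation, here is a minimal sketch (plain C++) of the call sequence the compiler-emitted instrumentation performs against these APIs; it mirrors the unit tests added in this patch. The `entrypoint`/`callee` functions, their GUIDs, and the counter/callsite counts are made-up illustration values - in a real build, LLVM emits the zero-initialized ContextRoot global and all of these calls itself.

#include "CtxInstrProfiling.h"

using namespace __ctx_profile;

// Stand-in for the global LLVM emits (and zero-initializes) per entry point.
static ContextRoot Root;

void callee();

// Hypothetical entry point: GUID 1, 2 counters, 1 callsite.
void entrypoint() {
  // Entry BB: acquire (or lazily set up) the root context. The returned
  // pointer may have its LSB set to mark it as a scratch context.
  ContextNode *Ctx = __llvm_ctx_profile_start_context(
      &Root, /*Guid=*/1, /*Counters=*/2, /*Callsites=*/1);
  // Right before callsite 0: publish the expected callee and the slot in this
  // context's subcontexts vector where the callee's ContextNode should hang.
  __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(&callee);
  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[0];
  callee();
  // Exit of the entry point: paired with __llvm_ctx_profile_start_context.
  __llvm_ctx_profile_release_context(&Root);
}

// Hypothetical non-entry-point function: GUID 2, 1 counter, no callsites.
void callee() {
  // Entry BB: look up (or allocate) this activation's context. The runtime
  // checks we are the expected callee (returning scratch otherwise) and bumps
  // the entry counter.
  ContextNode *Ctx = __llvm_ctx_profile_get_context(
      reinterpret_cast<void *>(&callee), /*Guid=*/2, /*NrCounters=*/1,
      /*NrCallsites=*/0);
  (void)Ctx; // further counter updates would go through Ctx->counters()
}

Threads that lose the race on Root.Taken - and any callee invoked from a scratch context - get a tainted (LSB-set) scratch buffer instead: counter writes remain memory-safe, but no subcontexts are linked under it.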
Added:
Modified:
compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
Removed:
################################################################################
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 7620ce92f7ebd..68bfe5c1ae614 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -10,20 +10,115 @@
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
+#include "sanitizer_common/sanitizer_vector.h"
#include <assert.h>
using namespace __ctx_profile;
+namespace {
+// Keep track of all the context roots we actually saw, so we can then traverse
+// them when the user asks for the profile in __llvm_ctx_profile_fetch
+__sanitizer::SpinMutex AllContextsMutex;
+SANITIZER_GUARDED_BY(AllContextsMutex)
+__sanitizer::Vector<ContextRoot *> AllContextRoots;
+
+// Utility to taint a pointer by setting the LSB. There is an assumption
+// throughout that the addresses of contexts are even (really, they should be
+// align(8), but "even"-ness is the minimum assumption).
+// "Scratch contexts" are buffers that we return in certain cases - they are
+// large enough to allow for memory-safe counter access, but they don't link
+// subcontexts below them (the runtime recognizes them and enforces that).
+ContextNode *markAsScratch(const ContextNode *Ctx) {
+ return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
+}
+
+// Used when getting the data from TLS. We don't *really* need to reset, but
+// it's a simpler system if we do.
+template <typename T> inline T consume(T &V) {
+ auto R = V;
+ V = {0};
+ return R;
+}
+
+// We allocate Arenas of at least kBuffSize bytes. The scratch buffer is also
+// that large.
+constexpr size_t kPower = 20;
+constexpr size_t kBuffSize = 1 << kPower;
+
+// Highly unlikely we need more than kBuffSize for a context.
+size_t getArenaAllocSize(size_t Needed) {
+ if (Needed >= kBuffSize)
+ return 2 * Needed;
+ return kBuffSize;
+}
+
+// verify the structural integrity of the context
+bool validate(const ContextRoot *Root) {
+ // all contexts should be laid out in some arena page. Go over each arena
+ // allocated for this Root, and jump over contained contexts based on
+ // self-reported sizes.
+ __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
+ for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+ const auto *Pos = Mem->start();
+ while (Pos < Mem->pos()) {
+ const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+ if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
+ .second)
+ return false;
+ Pos += Ctx->size();
+ }
+ }
+
+ // Now traverse the contexts again the same way, but validate that all
+ // non-null subcontext addresses appear in the set computed above.
+ for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+ const auto *Pos = Mem->start();
+ while (Pos < Mem->pos()) {
+ const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+ for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
+ for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
+ if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
+ return false;
+
+ Pos += Ctx->size();
+ }
+ }
+ return true;
+}
+} // namespace
+
+// The scratch buffer - what we give when we can't produce a real context (the
+// scratch isn't "real" in that it's expected to be clobbered carelessly - we
+// don't read it). The other important thing is that the callees from a scratch
+// context also get a scratch context.
+// Eventually this can be replaced with per-function buffers, a la the typical
+// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
+// the part about determining the nature of the subcontexts does.
+__thread char __Buffer[kBuffSize] = {0};
+
+#define TheScratchContext \
+ markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
+
+// init the TLSes
+__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
+ nullptr};
+__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
+
+__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
+ nullptr;
+
// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
assert(!Prev || Prev->Next == nullptr);
- Arena *NewArena =
- new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
+ Arena *NewArena = new (__sanitizer::InternalAlloc(
+ Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
+ Arena(Size);
if (Prev)
Prev->Next = NewArena;
return NewArena;
@@ -38,3 +133,187 @@ void Arena::freeArenaList(Arena *&A) {
}
A = nullptr;
}
+
+inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
+ uint32_t NrCounters,
+ uint32_t NrCallsites,
+ ContextNode *Next) {
+ assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
+ return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
+}
+
+void ContextNode::reset() {
+ // FIXME(mtrofin): this is std::memset, which we can probably use if we
+ // drop/reduce the dependency on sanitizer_common.
+ for (uint32_t I = 0; I < NrCounters; ++I)
+ counters()[I] = 0;
+ for (uint32_t I = 0; I < NrCallsites; ++I)
+ for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
+ Next->reset();
+}
+
+// If this is the first time we hit a callsite with this particular callee
+// (identified by Guid), we need to allocate.
+ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
+ uint32_t NrCounters, uint32_t NrCallsites) {
+ auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
+ auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
+ char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
+ if (!AllocPlace) {
+ // if we failed to allocate on the current arena, allocate a new arena,
+ // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
+ // find it from now on for other cases when we need to getCallsiteSlow.
+ // Note that allocateNewArena will link the allocated memory in the list of
+ // Arenas.
+ __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
+ Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+ AllocPlace = Mem->tryBumpAllocate(AllocSize);
+ }
+ auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
+ *InsertionPoint);
+ *InsertionPoint = Ret;
+ return Ret;
+}
+
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+ uint32_t NrCounters,
+ uint32_t NrCallsites) {
+ // fast "out" if we're not even doing contextual collection.
+ if (!__llvm_ctx_profile_current_context_root)
+ return TheScratchContext;
+
+ // Also fast "out" if the caller is scratch. We can tell by looking at the
+ // interior pointer into the subcontexts vector that the caller provided: if
+ // the context is scratch, so is that interior pointer (because all the
+ // address calculations use even - or, more precisely, 8-aligned - values).
+ auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
+ if (!CallsiteContext || isScratch(CallsiteContext))
+ return TheScratchContext;
+
+ // If the callee isn't the expected one, return scratch.
+ // Signal handler(s) could have been invoked at any point in the execution.
+ // Should that have happened, and had the handler been built with
+ // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
+ // Its sub call graph would have then populated
+ // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
+ // The normal call graph may be impacted in that, if the signal handler
+ // happened somewhere before we read the TLS here, we'd see the TLS reset and
+ // we'd also fail here. That would just mean we would lose counter values for
+ // the normal subgraph, this time around. That should be very unlikely, but if
+ // it happens too frequently, we should be able to detect discrepancies in
+ // entry counts (caller-callee). At the moment, the design goes on the
+ // assumption that this is so infrequent that it's not worth doing more for
+ // that case.
+ auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
+ if (ExpectedCallee != Callee)
+ return TheScratchContext;
+
+ auto *Callsite = *CallsiteContext;
+ // In the case of indirect calls, all the targets seen so far form a linked
+ // list here. Find the one corresponding to this callee.
+ while (Callsite && Callsite->guid() != Guid) {
+ Callsite = Callsite->next();
+ }
+ auto *Ret = Callsite ? Callsite
+ : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
+ NrCallsites);
+ if (Ret->callsites_size() != NrCallsites ||
+ Ret->counters_size() != NrCounters)
+ __sanitizer::Printf("[ctxprof] Returned ctx
diff ers from what's asked: "
+ "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
+ Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
+ Ret->callsites_size(), Ret->counters_size());
+ Ret->onEntry();
+ return Ret;
+}
+
+// This should be called once for a Root. Allocate the first arena, set up the
+// first context.
+void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
+ uint32_t NrCallsites) {
+ __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+ &AllContextsMutex);
+ // Re-check - we got here without having taken the lock.
+ if (Root->FirstMemBlock)
+ return;
+ const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
+ auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
+ Root->FirstMemBlock = M;
+ Root->CurrentMem = M;
+ Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
+ NrCounters, NrCallsites);
+ AllContextRoots.PushBack(Root);
+}
+
+ContextNode *__llvm_ctx_profile_start_context(
+ ContextRoot *Root, GUID Guid, uint32_t Counters,
+ uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+ if (!Root->FirstMemBlock) {
+ setupContext(Root, Guid, Counters, Callsites);
+ }
+ if (Root->Taken.TryLock()) {
+ __llvm_ctx_profile_current_context_root = Root;
+ Root->FirstNode->onEntry();
+ return Root->FirstNode;
+ }
+ // If this thread couldn't take the lock, return scratch context.
+ __llvm_ctx_profile_current_context_root = nullptr;
+ return TheScratchContext;
+}
+
+void __llvm_ctx_profile_release_context(ContextRoot *Root)
+ SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+ if (__llvm_ctx_profile_current_context_root) {
+ __llvm_ctx_profile_current_context_root = nullptr;
+ Root->Taken.Unlock();
+ }
+}
+
+void __llvm_ctx_profile_start_collection() {
+ size_t NrMemUnits = 0;
+ __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+ &AllContextsMutex);
+ for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
+ auto *Root = AllContextRoots[I];
+ __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
+ &Root->Taken);
+ for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
+ ++NrMemUnits;
+
+ Root->FirstNode->reset();
+ }
+ __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
+}
+
+bool __llvm_ctx_profile_fetch(
+ void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
+ assert(Writer);
+ __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+ &AllContextsMutex);
+
+ for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
+ auto *Root = AllContextRoots[I];
+ __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
+ &Root->Taken);
+ if (!validate(Root)) {
+ __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
+ return false;
+ }
+ if (!Writer(Data, *Root->FirstNode))
+ return false;
+ }
+ return true;
+}
+
+void __llvm_ctx_profile_free() {
+ __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+ &AllContextsMutex);
+ for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+ for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+ auto *C = A;
+ A = A->next();
+ __sanitizer::InternalFree(C);
+ }
+ AllContextRoots.Reset();
+}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index c1789c32a64c2..8c4be5d8a23a7 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -9,9 +9,16 @@
#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
#define CTX_PROFILE_CTXINSTRPROFILING_H_
+#include "sanitizer_common/sanitizer_mutex.h"
#include <sanitizer/common_interface_defs.h>
namespace __ctx_profile {
+using GUID = uint64_t;
+static constexpr size_t ExpectedAlignment = 8;
+// We really depend on this; see further below. We currently support x86_64.
+// When we want to support other archs, we need to trace the places Alignment is
+// used and adjust accordingly.
+static_assert(sizeof(void *) == ExpectedAlignment);
/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
/// Allocation and de-allocation happen using sanitizer APIs. We make that
@@ -51,5 +58,206 @@ class Arena final {
const uint64_t Size;
};
+// The memory available for allocation follows the Arena header, and we expect
+// it to be thus aligned.
+static_assert(alignof(Arena) == ExpectedAlignment);
+
+/// The contextual profile is a directed tree where each node has one parent. A
+/// node (ContextNode) corresponds to a function activation. The root of the
+/// tree is at a function that was marked as entrypoint to the compiler. A node
+/// stores counter values for edges and a vector of subcontexts. These are the
+/// contexts of callees. The index in the subcontext vector corresponds to the
+/// index of the callsite (as was instrumented via llvm.instrprof.callsite). At
+/// that index we find a linked list, potentially empty, of ContextNodes. Direct
+/// calls will have 0 or 1 values in the linked list, but indirect callsites may
+/// have more.
+///
+/// The ContextNode has a fixed-size header describing it - the GUID of the
+/// function and the sizes of the counter and callsite vectors. It is also an
+/// (intrusive) linked list for the purposes of the indirect call case above.
+///
+/// Allocation is expected to happen on an Arena. The allocation lays out inline
+/// the counter and subcontexts vectors. The class offers APIs to correctly
+/// reference the latter.
+///
+/// The layout is as follows:
+///
+/// [[declared fields][counters vector][vector of ptrs to subcontexts]]
+///
+/// See also documentation on the counters and subContexts members below.
+///
+/// The structure of the ContextNode is known to LLVM, because LLVM needs to:
+/// (1) increment counts, and
+/// (2) form a GEP for the position in the subcontext list of a callsite.
+/// This means changes to LLVM contextual profile lowering and changes here
+/// must be coupled.
+/// Note: the header content isn't interesting to LLVM (other than its size)
+///
+/// Part of contextual collection is the notion of "scratch contexts". These are
+/// buffers that are "large enough" to allow for memory-safe accesses during
+/// counter increments - meaning the counter increment code in LLVM doesn't need
+/// to be concerned with memory safety. Their subcontexts never get populated,
+/// though. The runtime code here produces and recognizes them.
+class ContextNode final {
+ const GUID Guid;
+ ContextNode *const Next;
+ const uint32_t NrCounters;
+ const uint32_t NrCallsites;
+
+public:
+ ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
+ ContextNode *Next = nullptr)
+ : Guid(Guid), Next(Next), NrCounters(NrCounters),
+ NrCallsites(NrCallsites) {}
+ static inline ContextNode *alloc(char *Place, GUID Guid, uint32_t NrCounters,
+ uint32_t NrCallsites,
+ ContextNode *Next = nullptr);
+
+ static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) {
+ return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters +
+ sizeof(ContextNode *) * NrCallsites;
+ }
+
+ // The counters vector starts right after the static header.
+ uint64_t *counters() {
+ ContextNode *addr_after = &(this[1]);
+ return reinterpret_cast<uint64_t *>(addr_after);
+ }
+
+ uint32_t counters_size() const { return NrCounters; }
+ uint32_t callsites_size() const { return NrCallsites; }
+
+ const uint64_t *counters() const {
+ return const_cast<ContextNode *>(this)->counters();
+ }
+
+ // The subcontexts vector starts right after the end of the counters vector.
+ ContextNode **subContexts() {
+ return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
+ }
+
+ ContextNode *const *subContexts() const {
+ return const_cast<ContextNode *>(this)->subContexts();
+ }
+
+ GUID guid() const { return Guid; }
+ ContextNode *next() { return Next; }
+
+ size_t size() const { return getAllocSize(NrCounters, NrCallsites); }
+
+ void reset();
+
+ // Since we go through the runtime to get a context back to LLVM, in the entry
+ // basic block, we might as well handle incrementing the entry basic block
+ // counter here.
+ void onEntry() { ++counters()[0]; }
+
+ uint64_t entrycount() const { return counters()[0]; }
+};
+
+// Verify maintenance to ContextNode doesn't change this invariant, which makes
+// sure the inlined vectors are appropriately aligned.
+static_assert(alignof(ContextNode) == ExpectedAlignment);
+
+/// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
+/// with allocating and zero-initializing the global value (as in, GlobalValue)
+/// for it.
+struct ContextRoot {
+ ContextNode *FirstNode = nullptr;
+ Arena *FirstMemBlock = nullptr;
+ Arena *CurrentMem = nullptr;
+ // This is init-ed by the static zero initializer in LLVM.
+ // Taken is used to ensure only one thread traverses the contextual graph -
+ // either to read it or to write it. On server side, the same entrypoint will
+ // be entered by numerous threads, but over time, the profile aggregated by
+ // collecting sequentially on one thread at a time is expected to converge to
+ // the aggregate profile that may have been observable on all the threads.
+ // Note that this is node-by-node aggregation, i.e. summing counters of nodes
+ // at the same position in the graph, not flattening.
+ // Threads that cannot lock Taken (fail TryLock) are given a "scratch context"
+ // - a buffer they can clobber, safely from a memory access perspective.
+ //
+ // Note about "scratch"-ness: we currently ignore the data written in them
+ // (which is anyway clobbered). The design allows for that not to be the case -
+ // because "scratch"-ness is first and foremost about not trying to build
+ // subcontexts, and is captured by tainting the pointer value (pointer to the
+ // memory treated as context), but right now, we drop that info.
+ //
+ // We could consider relaxing the one-thread-at-a-time requirement by holding
+ // a few context trees per entrypoint and then aggregating them (as explained
+ // above) at the end of the profile collection - it's a tradeoff between
+ // collection time and memory use: higher precision can be obtained with either
+ // fewer concurrent collections but more collection time, or with more
+ // concurrent collections (== more memory) and less collection time. Note that
+ // concurrent collection does happen for different entrypoints, regardless.
+ ::__sanitizer::StaticSpinMutex Taken;
+
+ // If (however unlikely) the StaticSpinMutex internals change, we need to
+ // modify the LLVM instrumentation lowering side because it is responsible for
+ // allocating and zero-initializing ContextRoots.
+ static_assert(sizeof(Taken) == 1);
+};
+
+/// This API is exposed for testing. See the APIs below about the contract with
+/// LLVM.
+inline bool isScratch(const void *Ctx) {
+ return (reinterpret_cast<uint64_t>(Ctx) & 1);
+}
+
} // namespace __ctx_profile
+
+extern "C" {
+
+// LLVM fills these in when lowering an llvm.instrprof.callsite intrinsic.
+// Position 0 is used when the current context isn't scratch, 1 when it is. They
+// are volatile because of signal handlers - we mean to specifically control
+// when the data is loaded.
+//
+/// TLS where LLVM stores the pointer of the called value, as part of lowering a
+/// llvm.instrprof.callsite
+extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
+/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
+/// corresponds to the callsite being lowered.
+extern __thread __ctx_profile::ContextNode *
+ *volatile __llvm_ctx_profile_callsite[2];
+
+// __llvm_ctx_profile_current_context_root is exposed for unit testing,
+// otherwise it's only used internally by compiler-rt/ctx_profile.
+extern __thread __ctx_profile::ContextRoot
+ *volatile __llvm_ctx_profile_current_context_root;
+
+/// Called by LLVM in the entry BB of an "entry point" function. The returned
+/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
+ __ctx_profile::GUID Guid, uint32_t Counters,
+ uint32_t Callsites);
+
+/// Paired with __llvm_ctx_profile_start_context, and called at the exit of the
+/// entry point function.
+void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
+
+/// Called in the entry BB of any function other than an entry point. Same
+/// consideration about the LSB of the returned value as for .._start_context.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_get_context(void *Callee, __ctx_profile::GUID Guid,
+ uint32_t NrCounters, uint32_t NrCallsites);
+
+/// Prepares for collection. Currently this resets counter values but preserves
+/// internal context tree structure.
+void __llvm_ctx_profile_start_collection();
+
+/// Completely free allocated memory.
+void __llvm_ctx_profile_free();
+
+/// Used to obtain the profile. The Writer is called for each root ContextNode,
+/// with the ContextRoot::Taken taken. The Writer is responsible for traversing
+/// the structure underneath.
+/// The Writer's first parameter plays the role of closure for Writer, and is
+/// what the caller of __llvm_ctx_profile_fetch passes as the Data parameter.
+/// The second parameter is the root of a context tree.
+bool __llvm_ctx_profile_fetch(
+ void *Data, bool (*Writer)(void *, const __ctx_profile::ContextNode &));
+}
#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index 44f37d2576320..f6ebe6ab2e50c 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -1,8 +1,17 @@
#include "../CtxInstrProfiling.h"
#include "gtest/gtest.h"
+#include <thread>
using namespace __ctx_profile;
+class ContextTest : public ::testing::Test {
+ void SetUp() override { memset(&Root, 0, sizeof(ContextRoot)); }
+ void TearDown() override { __llvm_ctx_profile_free(); }
+
+public:
+ ContextRoot Root;
+};
+
TEST(ArenaTest, Basic) {
Arena *A = Arena::allocateNewArena(1024);
EXPECT_EQ(A->size(), 1024U);
@@ -20,3 +29,186 @@ TEST(ArenaTest, Basic) {
Arena::freeArenaList(A);
EXPECT_EQ(A, nullptr);
}
+
+TEST_F(ContextTest, Basic) {
+ auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+ ASSERT_NE(Ctx, nullptr);
+ EXPECT_NE(Root.CurrentMem, nullptr);
+ EXPECT_EQ(Root.FirstMemBlock, Root.CurrentMem);
+ EXPECT_EQ(Ctx->size(), sizeof(ContextNode) + 10 * sizeof(uint64_t) +
+ 4 * sizeof(ContextNode *));
+ EXPECT_EQ(Ctx->counters_size(), 10U);
+ EXPECT_EQ(Ctx->callsites_size(), 4U);
+ EXPECT_EQ(__llvm_ctx_profile_current_context_root, &Root);
+ Root.Taken.CheckLocked();
+ EXPECT_FALSE(Root.Taken.TryLock());
+ __llvm_ctx_profile_release_context(&Root);
+ EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+ EXPECT_TRUE(Root.Taken.TryLock());
+ Root.Taken.Unlock();
+}
+
+TEST_F(ContextTest, Callsite) {
+ auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+ int FakeCalleeAddress = 0;
+ const bool IsScratch = isScratch(Ctx);
+ EXPECT_FALSE(IsScratch);
+ // This is the sequence the caller performs - it's the lowering of the
+ // instrumentation of the callsite "2". "2" is arbitrary here.
+ __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+ __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+ // This is what the callee does
+ auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+ // We expect the subcontext to be appropriately placed and dimensioned
+ EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+ EXPECT_EQ(Subctx->counters_size(), 3U);
+ EXPECT_EQ(Subctx->callsites_size(), 1U);
+ // We reset these in _get_context.
+ EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+ EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+ EXPECT_EQ(Subctx->size(), sizeof(ContextNode) + 3 * sizeof(uint64_t) +
+ 1 * sizeof(ContextNode *));
+ __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, ScratchNoCollection) {
+ EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+ int FakeCalleeAddress = 0;
+ // This would be the very first function executing this. The TLS is empty,
+ // too.
+ auto *Ctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+ // We never entered a context (_start_context was never called) - so the
+ // returned context must be scratch.
+ EXPECT_TRUE(isScratch(Ctx));
+}
+
+TEST_F(ContextTest, ScratchDuringCollection) {
+ auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+ int FakeCalleeAddress = 0;
+ int OtherFakeCalleeAddress = 0;
+ __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+ __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+ auto *Subctx =
+ __llvm_ctx_profile_get_context(&OtherFakeCalleeAddress, 2, 3, 1);
+ // We expected a different callee - so return scratch. It mimics what happens
+ // in the case of a signal handler - in this case, OtherFakeCalleeAddress is
+ // the signal handler.
+ EXPECT_TRUE(isScratch(Subctx));
+ EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+ EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+ int ThirdFakeCalleeAddress = 0;
+ __llvm_ctx_profile_expected_callee[1] = &ThirdFakeCalleeAddress;
+ __llvm_ctx_profile_callsite[1] = &Subctx->subContexts()[0];
+
+ auto *Subctx2 =
+ __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+ // We again expect scratch because the '0' position is where the runtime
+ // looks, so it doesn't matter that the '1' position is populated correctly.
+ EXPECT_TRUE(isScratch(Subctx2));
+
+ __llvm_ctx_profile_expected_callee[0] = &ThirdFakeCalleeAddress;
+ __llvm_ctx_profile_callsite[0] = &Subctx->subContexts()[0];
+ auto *Subctx3 =
+ __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+ // We expect scratch here, too, because the value placed in
+ // __llvm_ctx_profile_callsite is scratch
+ EXPECT_TRUE(isScratch(Subctx3));
+
+ __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, NeedMoreMemory) {
+ auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+ int FakeCalleeAddress = 0;
+ const bool IsScratch = isScratch(Ctx);
+ EXPECT_FALSE(IsScratch);
+ const auto *CurrentMem = Root.CurrentMem;
+ __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+ __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+ // Allocate a massive subcontext to force new arena allocation
+ auto *Subctx =
+ __llvm_ctx_profile_get_context(&FakeCalleeAddress, 3, 1 << 20, 1);
+ EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+ EXPECT_NE(CurrentMem, Root.CurrentMem);
+ EXPECT_NE(Root.CurrentMem, nullptr);
+}
+
+TEST_F(ContextTest, ConcurrentRootCollection) {
+ std::atomic<int> NonScratch = 0;
+ std::atomic<int> Executions = 0;
+
+ __sanitizer::Semaphore GotCtx;
+
+ auto Entrypoint = [&]() {
+ ++Executions;
+ auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+ GotCtx.Post();
+ const bool IS = isScratch(Ctx);
+ NonScratch += (!IS);
+ if (!IS) {
+ GotCtx.Wait();
+ GotCtx.Wait();
+ }
+ __llvm_ctx_profile_release_context(&Root);
+ };
+ std::thread T1(Entrypoint);
+ std::thread T2(Entrypoint);
+ T1.join();
+ T2.join();
+ EXPECT_EQ(NonScratch, 1);
+ EXPECT_EQ(Executions, 2);
+}
+
+TEST_F(ContextTest, Dump) {
+ auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+ int FakeCalleeAddress = 0;
+ __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
+ __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+ auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+ (void)Subctx;
+ __llvm_ctx_profile_release_context(&Root);
+
+ struct Writer {
+ ContextRoot *const Root;
+ const size_t Entries;
+ bool State = false;
+ Writer(ContextRoot *Root, size_t Entries) : Root(Root), Entries(Entries) {}
+
+ bool write(const ContextNode &Node) {
+ EXPECT_FALSE(Root->Taken.TryLock());
+ EXPECT_EQ(Node.guid(), 1);
+ EXPECT_EQ(Node.counters()[0], Entries);
+ EXPECT_EQ(Node.counters_size(), 10);
+ EXPECT_EQ(Node.callsites_size(), 4);
+ EXPECT_EQ(Node.subContexts()[0], nullptr);
+ EXPECT_EQ(Node.subContexts()[1], nullptr);
+ EXPECT_NE(Node.subContexts()[2], nullptr);
+ EXPECT_EQ(Node.subContexts()[3], nullptr);
+ const auto &SN = *Node.subContexts()[2];
+ EXPECT_EQ(SN.guid(), 2);
+ EXPECT_EQ(SN.counters()[0], Entries);
+ EXPECT_EQ(SN.counters_size(), 3);
+ EXPECT_EQ(SN.callsites_size(), 1);
+ EXPECT_EQ(SN.subContexts()[0], nullptr);
+ State = true;
+ return true;
+ }
+ };
+ Writer W(&Root, 1);
+ EXPECT_FALSE(W.State);
+ __llvm_ctx_profile_fetch(&W, [](void *W, const ContextNode &Node) -> bool {
+ return reinterpret_cast<Writer *>(W)->write(Node);
+ });
+ EXPECT_TRUE(W.State);
+
+ // this resets all counters but not the internal structure.
+ __llvm_ctx_profile_start_collection();
+ Writer W2(&Root, 0);
+ EXPECT_FALSE(W2.State);
+ __llvm_ctx_profile_fetch(&W2, [](void *W, const ContextNode &Node) -> bool {
+ return reinterpret_cast<Writer *>(W)->write(Node);
+ });
+ EXPECT_TRUE(W2.State);
+}