[llvm-branch-commits] [compiler-rt] [llvm] RootAutodetect (PR #133147)
Mircea Trofin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Mar 26 12:53:11 PDT 2025
https://github.com/mtrofin created https://github.com/llvm/llvm-project/pull/133147
None
From 7182baeef88e3d9448062118fd8af808a17fbcd9 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 24 Mar 2025 12:01:10 -0700
Subject: [PATCH] RootAutodetect
---
compiler-rt/lib/ctx_profile/CMakeLists.txt | 2 +-
.../lib/ctx_profile/CtxInstrContextNode.h | 1 +
.../lib/ctx_profile/CtxInstrProfiling.cpp | 119 +++++++++++-------
.../lib/ctx_profile/CtxInstrProfiling.h | 2 +-
.../lib/ctx_profile/RootAutoDetector.cpp | 84 +++++++++++++
.../lib/ctx_profile/RootAutoDetector.h | 29 +++++
.../TestCases/generate-context.cpp | 4 +-
.../llvm/ProfileData/CtxInstrContextNode.h | 1 +
8 files changed, 195 insertions(+), 47 deletions(-)
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
index bb606449c61b1..446ebc96408dd 100644
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -27,7 +27,7 @@ endif()
add_compiler_rt_runtime(clang_rt.ctx_profile
STATIC
ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
- OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
+ OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc RTSanitizerCommonSymbolizer
CFLAGS ${EXTRA_FLAGS}
SOURCES ${CTX_PROFILE_SOURCES}
ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
index a42bf9ebb01ea..aa052bc7eea6c 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -127,6 +127,7 @@ class ContextNode final {
/// MUTEXDECL takes one parameter, the name of a field that is a mutex.
#define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \
PTRDECL(FunctionData, Next) \
+ PTRDECL(void, EntryAddress) \
VOLATILE_PTRDECL(ContextRoot, CtxRoot) \
VOLATILE_PTRDECL(ContextNode, FlatCtx) \
MUTEXDECL(Mutex)
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index da291e0bbabdd..7e73214e639a3 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "CtxInstrProfiling.h"
+#include "RootAutoDetector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_atomic_clang.h"
@@ -43,6 +44,12 @@ Arena *FlatCtxArena = nullptr;
__thread bool IsUnderContext = false;
__sanitizer::atomic_uint8_t ProfilingStarted = {};
+__sanitizer::atomic_uintptr_t RootDetector = {};
+RootAutoDetector *getRootDetector() {
+ return reinterpret_cast<RootAutoDetector *>(
+ __sanitizer::atomic_load_relaxed(&RootDetector));
+}
+
// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
@@ -201,7 +208,7 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
return Ret;
}
-ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
+ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
uint32_t NumCounters) {
if (ContextNode *Existing = Data.FlatCtx)
return Existing;
@@ -232,6 +239,7 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
Data.FlatCtx = Ret;
+ Data.EntryAddress = Callee;
Data.Next = reinterpret_cast<FunctionData *>(
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
while (!__sanitizer::atomic_compare_exchange_strong(
@@ -277,8 +285,29 @@ ContextRoot *FunctionData::getOrAllocateContextRoot() {
return Root;
}
-ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
- uint32_t NumCounters) {
+ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
+ uint32_t Counters, uint32_t Callsites)
+ SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+ IsUnderContext = true;
+ __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
+ __sanitizer::memory_order_relaxed);
+
+ if (!Root->FirstMemBlock) {
+ setupContext(Root, Guid, Counters, Callsites);
+ }
+ if (Root->Taken.TryLock()) {
+ __llvm_ctx_profile_current_context_root = Root;
+ onContextEnter(*Root->FirstNode);
+ return Root->FirstNode;
+ }
+ // If this thread couldn't take the lock, return scratch context.
+ __llvm_ctx_profile_current_context_root = nullptr;
+ return TheScratchContext;
+}
+
+ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
+ uint32_t NumCounters, uint32_t NumCallsites,
+ ContextRoot *CtxRoot) {
// 1) if we are currently collecting a contextual profile, fetch a ContextNode
// in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
@@ -297,27 +326,30 @@ ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
// entered once and never exit. They should be assumed to be entered before
// profiling starts - because profiling should start after the server is up
// and running (which is equivalent to "message pumps are set up").
- ContextRoot *R = __llvm_ctx_profile_current_context_root;
- if (!R) {
+ if (!CtxRoot) {
+ if (auto *RAD = getRootDetector())
+ RAD->sample();
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
return TheScratchContext;
else
return markAsScratch(
- onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+ onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
}
- auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
+ auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
if (Ins)
- Iter->second =
- getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
+ Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
+ NumCounters, 0);
return markAsScratch(onContextEnter(*Iter->second));
}
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
GUID Guid, uint32_t NumCounters,
uint32_t NumCallsites) {
+ auto *CtxRoot = __llvm_ctx_profile_current_context_root;
// fast "out" if we're not even doing contextual collection.
- if (!__llvm_ctx_profile_current_context_root)
- return getUnhandledContext(*Data, Guid, NumCounters);
+ if (!CtxRoot)
+ return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+ nullptr);
// also fast "out" if the caller is scratch. We can see if it's scratch by
// looking at the interior pointer into the subcontexts vector that the caller
@@ -326,7 +358,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
// precisely, aligned - 8 values)
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
if (!CallsiteContext || isScratch(CallsiteContext))
- return getUnhandledContext(*Data, Guid, NumCounters);
+ return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+ CtxRoot);
// if the callee isn't the expected one, return scratch.
// Signal handler(s) could have been invoked at any point in the execution.
@@ -344,7 +377,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
// for that case.
auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
if (ExpectedCallee != Callee)
- return getUnhandledContext(*Data, Guid, NumCounters);
+ return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+ CtxRoot);
auto *Callsite = *CallsiteContext;
// in the case of indirect calls, we will have all seen targets forming a
@@ -366,40 +400,26 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
return Ret;
}
-ContextNode *__llvm_ctx_profile_start_context(
- FunctionData *FData, GUID Guid, uint32_t Counters,
- uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
- IsUnderContext = true;
-
- auto *Root = FData->getOrAllocateContextRoot();
-
- __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
- __sanitizer::memory_order_relaxed);
+ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
+ uint32_t Counters,
+ uint32_t Callsites) {
- if (!Root->FirstMemBlock) {
- setupContext(Root, Guid, Counters, Callsites);
- }
- if (Root->Taken.TryLock()) {
- __llvm_ctx_profile_current_context_root = Root;
- onContextEnter(*Root->FirstNode);
- return Root->FirstNode;
- }
- // If this thread couldn't take the lock, return scratch context.
- __llvm_ctx_profile_current_context_root = nullptr;
- return TheScratchContext;
+ return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid,
+ Counters, Callsites);
}
void __llvm_ctx_profile_release_context(FunctionData *FData)
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+ const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
+ if (!CurrentRoot || FData->CtxRoot != CurrentRoot)
+ return;
IsUnderContext = false;
- if (__llvm_ctx_profile_current_context_root) {
- __llvm_ctx_profile_current_context_root = nullptr;
- assert(FData->CtxRoot);
- FData->CtxRoot->Taken.Unlock();
- }
+ assert(FData->CtxRoot);
+ __llvm_ctx_profile_current_context_root = nullptr;
+ FData->CtxRoot->Taken.Unlock();
}
-void __llvm_ctx_profile_start_collection() {
+void __llvm_ctx_profile_start_collection(bool AutodetectRoots) {
size_t NumMemUnits = 0;
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);
@@ -415,12 +435,24 @@ void __llvm_ctx_profile_start_collection() {
resetContextNode(*Root->FirstUnhandledCalleeNode);
__sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
}
- __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
- __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
+ if (AutodetectRoots) {
+ auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
+ RootAutoDetector(AllFunctionsData, RootDetector);
+ RD->start();
+ } else {
+ __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
+ __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
+ }
}
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
__sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
+ if (auto *RD = getRootDetector()) {
+ __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
+ "finished well before attempting to fetch a context");
+ RD->join();
+ }
+
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);
@@ -445,8 +477,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
const auto *Pos = reinterpret_cast<const FunctionData *>(
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
for (; Pos; Pos = Pos->Next)
- Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
- Pos->FlatCtx->counters_size());
+ if (!Pos->CtxRoot)
+ Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
+ Pos->FlatCtx->counters_size());
Writer.endFlatSection();
return true;
}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index 6326beaa53085..220a8bd25e6ef 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -207,7 +207,7 @@ ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
/// Prepares for collection. Currently this resets counter values but preserves
/// internal context tree structure.
-void __llvm_ctx_profile_start_collection();
+void __llvm_ctx_profile_start_collection(bool AutodetectRoots = false);
/// Completely free allocated memory.
void __llvm_ctx_profile_free();
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
index 7daa8f31e16ea..5888545a79d65 100644
--- a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
@@ -18,6 +18,90 @@
using namespace __ctx_profile;
+namespace __sanitizer {
+void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
+ bool request_fast, u32 max_depth) {
+ // We can't implement the fast variant. The fast variant ends up invoking an
+ // external allocator, because of pthread_attr_getstack. If this happens
+ // during an allocation of the program being instrumented, a non-reentrant
+ // lock may be taken (this was observed). The allocator called by
+ // pthread_attr_getstack will also try to take that lock.
+ UnwindSlow(pc, max_depth);
+}
+} // namespace __sanitizer
+
+RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
+ GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
+ Parent.AllSamples.PushBack(this);
+}
+
+void RootAutoDetector::start() {
+ atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
+ pthread_create(
+ &WorkerThread, nullptr,
+ +[](void *Ctx) -> void * {
+ RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
+ SleepForSeconds(30);
+ Vector<PerThreadSamples*> Copy;
+ {
+ GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
+ Copy.Resize(RAD->AllSamples.Size());
+ for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
+ Copy[I] = RAD->AllSamples[I];
+ }
+ DenseMap<uptr, uint64_t> AllRoots;
+ for (uptr I = 0; I < Copy.Size(); ++I) {
+ GenericScopedLock<SpinMutex>(&Copy[I]->M);
+ Copy[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
+ auto [FAddr, Count] = KVP;
+ AllRoots[FAddr] += Count;
+ return true;
+ });
+ }
+ for (auto *FD = reinterpret_cast<FunctionData *>(
+ atomic_load_relaxed(&RAD->FunctionDataListHead));
+ FD; FD = FD->Next) {
+ if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
+ GenericScopedLock<SpinMutex> M(&FD->Mutex);
+ FD->getOrAllocateContextRoot();
+ }
+ }
+ atomic_store_relaxed(&RAD->Self, 0);
+ return nullptr;
+ },
+ this);
+}
+
+void RootAutoDetector::join() {
+ pthread_join(WorkerThread, nullptr);
+}
+
+void RootAutoDetector::sample() {
+ static thread_local bool Entered = false;
+ static thread_local uint64_t Entries = 0;
+ if (Entered || (++Entries % SampleRate))
+ return;
+ Entered = true;
+ collectStack();
+ Entered = false;
+}
+
+void RootAutoDetector::collectStack() {
+ GET_CALLER_PC_BP;
+ BufferedStackTrace CurrentStack;
+ CurrentStack.Unwind(pc, bp, nullptr, false);
+ if (CurrentStack.size <= 2) return;
+ static thread_local PerThreadSamples *ThisThreadSamples =
+ new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
+ PerThreadSamples(*this);
+
+ if (!ThisThreadSamples->M.TryLock())
+ return;
+
+ ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
+ ThisThreadSamples->M.Unlock();
+}
+
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
// this requires --linkopt=-Wl,--export-dynamic
Dl_info Info;
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.h b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
index ab51a342d3617..254a40b163632 100644
--- a/compiler-rt/lib/ctx_profile/RootAutoDetector.h
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
@@ -12,6 +12,7 @@
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_vector.h"
#include <pthread.h>
#include <sanitizer/common_interface_defs.h>
@@ -64,5 +65,33 @@ class PerThreadCallsiteTrie {
const Trie &start() const { return T; }
};
+
+class RootAutoDetector final {
+ static const uint64_t SampleRate = 6113;
+ pthread_t WorkerThread;
+
+ struct PerThreadSamples {
+ PerThreadSamples(RootAutoDetector &Parent);
+
+ PerThreadCallsiteTrie TrieRoot;
+ SpinMutex M;
+ };
+ SpinMutex AllSamplesMutex;
+ SANITIZER_GUARDED_BY(AllSamplesMutex)
+ Vector<PerThreadSamples*> AllSamples;
+ atomic_uintptr_t &FunctionDataListHead;
+ atomic_uintptr_t &Self;
+ void collectStack();
+
+public:
+ RootAutoDetector(atomic_uintptr_t &FunctionDataListHead,
+ atomic_uintptr_t &Self)
+ : FunctionDataListHead(FunctionDataListHead), Self(Self) {}
+
+ void sample();
+ void start();
+ void join();
+};
+
} // namespace __ctx_profile
#endif
diff --git a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
index 3dc53637a35d8..7c0d7804ff4a4 100644
--- a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
+++ b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
@@ -16,7 +16,7 @@
#include <iostream>
using namespace llvm::ctx_profile;
-extern "C" void __llvm_ctx_profile_start_collection();
+extern "C" void __llvm_ctx_profile_start_collection(bool);
extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
// avoid name mangling
@@ -159,7 +159,7 @@ bool profileWriter() {
}
int main(int argc, char **argv) {
- __llvm_ctx_profile_start_collection();
+ __llvm_ctx_profile_start_collection(false);
theRoot();
flatFct();
// This would be implemented in a specific RPC handler, but here we just call
diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
index a42bf9ebb01ea..aa052bc7eea6c 100644
--- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
+++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
@@ -127,6 +127,7 @@ class ContextNode final {
/// MUTEXDECL takes one parameter, the name of a field that is a mutex.
#define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \
PTRDECL(FunctionData, Next) \
+ PTRDECL(void, EntryAddress) \
VOLATILE_PTRDECL(ContextRoot, CtxRoot) \
VOLATILE_PTRDECL(ContextNode, FlatCtx) \
MUTEXDECL(Mutex)
More information about the llvm-branch-commits
mailing list