[compiler-rt] [compiler-rt][ctx_profile] Add the instrumented contextual profiling APIs (PR #89838)

Mon May 6 08:46:42 PDT 2024

================
@@ -38,3 +128,177 @@ void Arena::freeArenaList(Arena *&A) {
   }
   A = nullptr;
 }
+
+inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
+                                       uint32_t NrCounters,
+                                       uint32_t NrCallsites,
+                                       ContextNode *Next) {
+  assert(reinterpret_cast<uint64_t>(Place) % sizeof(void *) == 0);
+  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
+}
+
+void ContextNode::reset() {
+  for (uint32_t I = 0; I < NrCounters; ++I)
+    counters()[I] = 0;
+  for (uint32_t I = 0; I < NrCallsites; ++I)
+    for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
+      Next->reset();
+}
+
+// If this is the first time we hit a callsite with this (Guid) particular
+// callee, we need to allocate.
+ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
+                             uint32_t NrCounters, uint32_t NrCallsites) {
+  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
+  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  if (!AllocPlace) {
+    // if we failed to allocate on the current arena, allocate a new arena,
+    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
+    // find it from now on for other cases when we need to getCallsiteSlow.
+    // Note that allocateNewArena will link the allocated memory in the list of
+    // Arenas.
+    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
+        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+  }
+  auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
+                                 *InsertionPoint);
+  *InsertionPoint = Ret;
+  return Ret;
+}
+
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites) {
+  // fast "out" if we're not even doing contextual collection.
+  if (!__llvm_ctx_profile_current_context_root)
+    return TheScratchContext;
+
+  // also fast "out" if the caller is scratch.
+  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
+  if (!CallsiteContext || isScratch(*CallsiteContext))
+    return TheScratchContext;
+
+  // if the callee isn't the expected one, return scratch.
+  // Signal handler(s) could have been invoked at any point in the execution.
+  // Should that have happened, and had it (the handler) be built with
+  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
+  // Its sub call graph would have then populated
+  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
+  // The normal call graph may be impacted in that, if the signal handler
+  // happened somewhere before we read the TLS here, we'd see the TLS reset and
+  // we'd also fail here. That would just mean we would loose counter values for
+  // the normal subgraph, this time around. That should be very unlikely, but if
+  // it happens too frequently, we should be able to detect discrepancies in
+  // entry counts (caller-callee). At the moment, the design goes on the
+  // assumption that is so unfrequent, though, that it's not worth doing more
+  // for that case.
+  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
+  if (ExpectedCallee != Callee)
+    return TheScratchContext;
+
+  auto *Callsite = *CallsiteContext;
+  while (Callsite && Callsite->guid() != Guid) {
+    Callsite = Callsite->next();
+  }
+  auto *Ret = Callsite ? Callsite
+                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
+                                         NrCallsites);
+  if (Ret->callsites_size() != NrCallsites ||
+      Ret->counters_size() != NrCounters)
+    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
+                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
+                        Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
+                        Ret->callsites_size(), Ret->counters_size());
+  Ret->onEntry();
+  return Ret;
+}
+
+// This should be called once for a Root. Allocate the first arena, set up the
+// first context.
+void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
+                  uint32_t NrCallsites) {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  // Re-check - we got here without having had taken a lock.
+  if (Root->FirstMemBlock)
+    return;
+  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
+  Root->FirstMemBlock = M;
+  Root->CurrentMem = M;
+  Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
+                                       NrCounters, NrCallsites);
+  AllContextRoots.PushBack(Root);
+}
+
+ContextNode *__llvm_ctx_profile_start_context(
+    ContextRoot *Root, GUID Guid, uint32_t Counters,
+    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (!Root->FirstMemBlock) {
+    setupContext(Root, Guid, Counters, Callsites);
+  }
+  if (Root->Taken.TryLock()) {
+    __llvm_ctx_profile_current_context_root = Root;
+    Root->FirstNode->onEntry();
+    return Root->FirstNode;
+  }
+  __llvm_ctx_profile_current_context_root = nullptr;
+  return TheScratchContext;
+}
+
+void __llvm_ctx_profile_release_context(ContextRoot *Root)
+    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (__llvm_ctx_profile_current_context_root) {
+    __llvm_ctx_profile_current_context_root = nullptr;
+    Root->Taken.Unlock();
+  }
+}
+
+void __llvm_ctx_profile_start_collection() {
+  size_t NrMemUnits = 0;
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
+        &Root->Taken);
+    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
+      ++NrMemUnits;
+
+    Root->FirstNode->reset();
+  }
+  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
+}
+
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
+  assert(Writer);
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
+        &Root->Taken);
+    if (!validate(Root)) {
+      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
----------------
teresajohnson wrote:

why not just put "invalid" directly in the format string? Does __sanitizer::Printf require at least 2 arguments or something like that?

https://github.com/llvm/llvm-project/pull/89838