[compiler-rt] [compiler-rt][ctx_profile] Add the instrumented contextual profiling APIs (PR #89838)

Mircea Trofin via llvm-commits llvm-commits at lists.llvm.org
Mon May 6 16:44:42 PDT 2024


https://github.com/mtrofin updated https://github.com/llvm/llvm-project/pull/89838

>From 75161d49e39a766648bee39f57f419552ecb39bc Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Tue, 23 Apr 2024 14:48:19 -0700
Subject: [PATCH 1/5] [compiler-rt][ctx_profile] Add the instrumented
 contextual profiling APIs

APIs for contextual profiling.

(Tracking Issue: #89287, RFC referenced there)
---
 .../lib/ctx_profile/CtxInstrProfiling.cpp     | 213 ++++++++++++++++++
 .../lib/ctx_profile/CtxInstrProfiling.h       | 116 ++++++++++
 .../tests/CtxInstrProfilingTest.cpp           | 154 +++++++++++++
 3 files changed, 483 insertions(+)

diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 7620ce92f7ebde..1db74eec35a1eb 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -13,11 +13,76 @@
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_thread_safety.h"
+#include "sanitizer_common/sanitizer_vector.h"
 
 #include <assert.h>
 
 using namespace __ctx_profile;
 
+namespace {
+__sanitizer::SpinMutex AllContextsMutex;
+SANITIZER_GUARDED_BY(AllContextsMutex)
+__sanitizer::Vector<ContextRoot *> AllContextRoots;
+
+ContextNode *markAsScratch(const ContextNode *Ctx) {
+  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
+}
+
+template <typename T> T consume(T &V) {
+  auto R = V;
+  V = {0};
+  return R;
+}
+
+constexpr size_t kPower = 20;
+constexpr size_t kBuffSize = 1 << kPower;
+
+size_t getArenaAllocSize(size_t Needed) {
+  if (Needed >= kBuffSize)
+    return 2 * Needed;
+  return kBuffSize;
+}
+
+bool validate(const ContextRoot *Root) {
+  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
+               .second)
+        return false;
+      Pos += Ctx->size();
+    }
+  }
+
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
+        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
+          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
+            return false;
+
+      Pos += Ctx->size();
+    }
+  }
+  return true;
+}
+} // namespace
+
+__thread char __Buffer[kBuffSize] = {0};
+
+#define TheScratchContext                                                      \
+  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
+__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
+                                                                 nullptr};
+__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
+
+__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
+    nullptr;
+
 // FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
 // the dependency on the latter.
 Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
@@ -38,3 +103,151 @@ void Arena::freeArenaList(Arena *&A) {
   }
   A = nullptr;
 }
+
+inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
+                                       uint32_t NrCounters,
+                                       uint32_t NrCallsites,
+                                       ContextNode *Next) {
+  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
+}
+
+void ContextNode::reset() {
+  for (uint32_t I = 0; I < NrCounters; ++I)
+    counters()[I] = 0;
+  for (uint32_t I = 0; I < NrCallsites; ++I)
+    for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
+      Next->reset();
+}
+
+ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
+                             uint32_t NrCounters, uint32_t NrCallsites) {
+  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
+  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  if (!AllocPlace) {
+    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
+        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+  }
+  auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
+                                 *InsertionPoint);
+  *InsertionPoint = Ret;
+  return Ret;
+}
+
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites) {
+  if (!__llvm_ctx_profile_current_context_root) {
+    return TheScratchContext;
+  }
+  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
+  if (!CallsiteContext || isScratch(*CallsiteContext))
+    return TheScratchContext;
+
+  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
+  if (ExpectedCallee != Callee)
+    return TheScratchContext;
+
+  auto *Callsite = *CallsiteContext;
+  while (Callsite && Callsite->guid() != Guid) {
+    Callsite = Callsite->next();
+  }
+  auto *Ret = Callsite ? Callsite
+                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
+                                         NrCallsites);
+  if (Ret->callsites_size() != NrCallsites ||
+      Ret->counters_size() != NrCounters)
+    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
+                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
+                        Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
+                        Ret->callsites_size(), Ret->counters_size());
+  Ret->onEntry();
+  return Ret;
+}
+
+void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
+                  uint32_t NrCallsites) {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  // Re-check - we got here without having taken the lock.
+  if (Root->FirstMemBlock)
+    return;
+  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
+  Root->FirstMemBlock = M;
+  Root->CurrentMem = M;
+  Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
+                                       NrCounters, NrCallsites);
+  AllContextRoots.PushBack(Root);
+}
+
+ContextNode *__llvm_ctx_profile_start_context(
+    ContextRoot *Root, GUID Guid, uint32_t Counters,
+    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (!Root->FirstMemBlock) {
+    setupContext(Root, Guid, Counters, Callsites);
+  }
+  if (Root->Taken.TryLock()) {
+    __llvm_ctx_profile_current_context_root = Root;
+    Root->FirstNode->onEntry();
+    return Root->FirstNode;
+  }
+  __llvm_ctx_profile_current_context_root = nullptr;
+  return TheScratchContext;
+}
+
+void __llvm_ctx_profile_release_context(ContextRoot *Root)
+    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (__llvm_ctx_profile_current_context_root) {
+    __llvm_ctx_profile_current_context_root = nullptr;
+    Root->Taken.Unlock();
+  }
+}
+
+void __llvm_ctx_profile_start_collection() {
+  size_t NrMemUnits = 0;
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
+        &Root->Taken);
+    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
+      ++NrMemUnits;
+
+    Root->FirstNode->reset();
+  }
+  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
+}
+
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
+  assert(Writer);
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
+        &Root->Taken);
+    if (!validate(Root)) {
+      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
+      return false;
+    }
+    if (!Writer(Data, *Root->FirstNode))
+      return false;
+  }
+  return true;
+}
+
+void __llvm_ctx_profile_free() {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+      auto *C = A;
+      A = A->next();
+      __sanitizer::InternalFree(C);
+    }
+  AllContextRoots.Reset();
+}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index c1789c32a64c25..9a1a74ca4cb8d6 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -9,9 +9,11 @@
 #ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
 #define CTX_PROFILE_CTXINSTRPROFILING_H_
 
+#include "sanitizer_common/sanitizer_mutex.h"
 #include <sanitizer/common_interface_defs.h>
 
 namespace __ctx_profile {
+using GUID = uint64_t;
 
 /// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
 /// Allocation and de-allocation happen using sanitizer APIs. We make that
@@ -51,5 +53,119 @@ class Arena final {
   const uint64_t Size;
 };
 
+class ContextNode final {
+  const GUID Guid;
+  ContextNode *const Next;
+  const uint32_t NrCounters;
+  const uint32_t NrCallsites;
+
+public:
+  ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
+              ContextNode *Next = nullptr)
+      : Guid(Guid), Next(Next), NrCounters(NrCounters),
+        NrCallsites(NrCallsites) {}
+  static inline ContextNode *alloc(char *Place, GUID Guid, uint32_t NrCounters,
+                                   uint32_t NrCallsites,
+                                   ContextNode *Next = nullptr);
+
+  static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) {
+    return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters +
+           sizeof(ContextNode *) * NrCallsites;
+  }
+
+  uint64_t *counters() {
+    ContextNode *addr_after = &(this[1]);
+    return reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(addr_after));
+  }
+
+  uint32_t counters_size() const { return NrCounters; }
+  uint32_t callsites_size() const { return NrCallsites; }
+
+  const uint64_t *counters() const {
+    return const_cast<ContextNode *>(this)->counters();
+  }
+
+  ContextNode **subContexts() {
+    return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
+  }
+
+  ContextNode *const *subContexts() const {
+    return const_cast<ContextNode *>(this)->subContexts();
+  }
+
+  GUID guid() const { return Guid; }
+  ContextNode *next() { return Next; }
+
+  size_t size() const { return getAllocSize(NrCounters, NrCallsites); }
+
+  void reset();
+
+  void onEntry() { ++counters()[0]; }
+
+  uint64_t entrycount() const { return counters()[0]; }
+};
+
+/// ContextRoots are allocated by LLVM for entrypoints. The main concern is
+/// the total size, LLVM doesn't actually dereference members.
+struct ContextRoot {
+  ContextNode *FirstNode = nullptr;
+  Arena *FirstMemBlock = nullptr;
+  Arena *CurrentMem = nullptr;
+  // This is init-ed by the static zero initializer in LLVM.
+  ::__sanitizer::StaticSpinMutex Taken;
+
+  // Avoid surprises due to (unlikely) StaticSpinMutex changes.
+  static_assert(sizeof(Taken) == 1);
+};
+
+/// This API is exposed for testing.
+inline bool isScratch(const ContextNode *Ctx) {
+  return (reinterpret_cast<uint64_t>(Ctx) & 1);
+}
+
 } // namespace __ctx_profile
+
+extern "C" {
+
+// LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
+// position 0 is used when the current context isn't scratch, 1 when it is.
+extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
+extern __thread __ctx_profile::ContextNode *
+    *volatile __llvm_ctx_profile_callsite[2];
+
+// __llvm_ctx_profile_current_context_root is exposed for unit testing,
+// otherwise it's only used internally.
+extern __thread __ctx_profile::ContextRoot
+    *volatile __llvm_ctx_profile_current_context_root;
+
+/// called by LLVM in the entry BB of an "entry point" function. The returned
+/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
+                                 __ctx_profile::GUID Guid, uint32_t Counters,
+                                 uint32_t Callsites);
+
+/// paired with __llvm_ctx_profile_start_context, and called at the exit of the
+/// entry point function.
+void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
+
+/// called in the entry BB of any function other than an entry point. Same
+/// consideration about the LSB of the returned value as for .._start_context.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_get_context(void *Callee, __ctx_profile::GUID Guid,
+                               uint32_t NrCounters, uint32_t NrCallsites);
+
+/// Prepares for collection. Currently this resets counter values but preserves
+/// internal structure.
+void __llvm_ctx_profile_start_collection();
+
+/// Completely free allocated memory.
+void __llvm_ctx_profile_free();
+
+/// Used to obtain the profile. The Writer is called for each root ContextNode,
+/// with the ContextRoot::Taken taken. The Writer is responsible for traversing
+/// the structure underneath.
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *, const __ctx_profile::ContextNode &));
+}
 #endif // CTX_PROFILE_CTXINSTRPROFILING_H_
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index 44f37d25763206..e22edeb6192311 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -1,8 +1,17 @@
 #include "../CtxInstrProfiling.h"
 #include "gtest/gtest.h"
+#include <thread>
 
 using namespace __ctx_profile;
 
+class ContextTest : public ::testing::Test {
+  void SetUp() override { memset(&Root, 0, sizeof(ContextRoot)); }
+  void TearDown() override { __llvm_ctx_profile_free(); }
+
+public:
+  ContextRoot Root;
+};
+
 TEST(ArenaTest, Basic) {
   Arena *A = Arena::allocateNewArena(1024);
   EXPECT_EQ(A->size(), 1024U);
@@ -20,3 +29,148 @@ TEST(ArenaTest, Basic) {
   Arena::freeArenaList(A);
   EXPECT_EQ(A, nullptr);
 }
+
+TEST_F(ContextTest, Basic) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  EXPECT_NE(Ctx, nullptr);
+  EXPECT_NE(Root.CurrentMem, nullptr);
+  EXPECT_EQ(Root.FirstMemBlock, Root.CurrentMem);
+  EXPECT_EQ(Ctx->size(), sizeof(ContextNode) + 10 * sizeof(uint64_t) +
+                             4 * sizeof(ContextNode *));
+  EXPECT_EQ(Ctx->counters_size(), 10U);
+  EXPECT_EQ(Ctx->callsites_size(), 4U);
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, &Root);
+  Root.Taken.CheckLocked();
+  EXPECT_FALSE(Root.Taken.TryLock());
+  __llvm_ctx_profile_release_context(&Root);
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  EXPECT_TRUE(Root.Taken.TryLock());
+  Root.Taken.Unlock();
+}
+
+TEST_F(ContextTest, Callsite) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int OpaqueValue = 0;
+  const bool IsScratch = isScratch(Ctx);
+  EXPECT_FALSE(IsScratch);
+  __llvm_ctx_profile_expected_callee[0] = &OpaqueValue;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  auto *Subctx = __llvm_ctx_profile_get_context(&OpaqueValue, 2, 3, 1);
+  EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+  EXPECT_EQ(Subctx->counters_size(), 3U);
+  EXPECT_EQ(Subctx->callsites_size(), 1U);
+  EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+  EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+  EXPECT_EQ(Subctx->size(), sizeof(ContextNode) + 3 * sizeof(uint64_t) +
+                                1 * sizeof(ContextNode *));
+  __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, ScratchNoCollection) {
+  EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  int OpaqueValue = 0;
+  // this would be the very first function executing this. the TLS is empty,
+  // too.
+  auto *Ctx = __llvm_ctx_profile_get_context(&OpaqueValue, 2, 3, 1);
+  EXPECT_TRUE(isScratch(Ctx));
+}
+
+TEST_F(ContextTest, ScratchDuringCollection) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int OpaqueValue = 0;
+  int OtherOpaqueValue = 0;
+  __llvm_ctx_profile_expected_callee[0] = &OpaqueValue;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  auto *Subctx = __llvm_ctx_profile_get_context(&OtherOpaqueValue, 2, 3, 1);
+  EXPECT_TRUE(isScratch(Subctx));
+  EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
+  EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
+
+  int ThirdOpaqueValue = 0;
+  __llvm_ctx_profile_expected_callee[1] = &ThirdOpaqueValue;
+  __llvm_ctx_profile_callsite[1] = &Subctx->subContexts()[0];
+
+  auto *Subctx2 = __llvm_ctx_profile_get_context(&ThirdOpaqueValue, 3, 0, 0);
+  EXPECT_TRUE(isScratch(Subctx2));
+
+  __llvm_ctx_profile_release_context(&Root);
+}
+
+TEST_F(ContextTest, ConcurrentRootCollection) {
+  std::atomic<int> NonScratch = 0;
+  std::atomic<int> Executions = 0;
+
+  __sanitizer::Semaphore GotCtx;
+
+  auto Entrypoint = [&]() {
+    ++Executions;
+    auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+    GotCtx.Post();
+    const bool IS = isScratch(Ctx);
+    NonScratch += (!IS);
+    if (!IS) {
+      GotCtx.Wait();
+      GotCtx.Wait();
+    }
+    __llvm_ctx_profile_release_context(&Root);
+  };
+  std::thread T1(Entrypoint);
+  std::thread T2(Entrypoint);
+  T1.join();
+  T2.join();
+  EXPECT_EQ(NonScratch, 1);
+  EXPECT_EQ(Executions, 2);
+}
+
+TEST_F(ContextTest, Dump) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int OpaqueValue = 0;
+  __llvm_ctx_profile_expected_callee[0] = &OpaqueValue;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  auto *Subctx = __llvm_ctx_profile_get_context(&OpaqueValue, 2, 3, 1);
+  (void)Subctx;
+  __llvm_ctx_profile_release_context(&Root);
+
+  struct Writer {
+    ContextRoot *const Root;
+    const size_t Entries;
+    bool State = false;
+    Writer(ContextRoot *Root, size_t Entries) : Root(Root), Entries(Entries) {}
+
+    bool write(const ContextNode &Node) {
+      EXPECT_FALSE(Root->Taken.TryLock());
+      EXPECT_EQ(Node.guid(), 1);
+      EXPECT_EQ(Node.counters()[0], Entries);
+      EXPECT_EQ(Node.counters_size(), 10);
+      EXPECT_EQ(Node.callsites_size(), 4);
+      EXPECT_EQ(Node.subContexts()[0], nullptr);
+      EXPECT_EQ(Node.subContexts()[1], nullptr);
+      EXPECT_NE(Node.subContexts()[2], nullptr);
+      EXPECT_EQ(Node.subContexts()[3], nullptr);
+      const auto &SN = *Node.subContexts()[2];
+      EXPECT_EQ(SN.guid(), 2);
+      EXPECT_EQ(SN.counters()[0], Entries);
+      EXPECT_EQ(SN.counters_size(), 3);
+      EXPECT_EQ(SN.callsites_size(), 1);
+      EXPECT_EQ(SN.subContexts()[0], nullptr);
+      State = true;
+      return true;
+    }
+  };
+  Writer W(&Root, 1);
+  EXPECT_FALSE(W.State);
+  __llvm_ctx_profile_fetch(&W, [](void *W, const ContextNode &Node) -> bool {
+    return reinterpret_cast<Writer *>(W)->write(Node);
+  });
+  EXPECT_TRUE(W.State);
+
+  // this resets all counters but not the internal structure.
+  __llvm_ctx_profile_start_collection();
+  Writer W2(&Root, 0);
+  EXPECT_FALSE(W2.State);
+  __llvm_ctx_profile_fetch(&W2, [](void *W, const ContextNode &Node) -> bool {
+    return reinterpret_cast<Writer *>(W)->write(Node);
+  });
+  EXPECT_TRUE(W2.State);
+}
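
As a reading aid (not part of the patch), here is a hand-written sketch of how
instrumented code is expected to drive the APIs declared above, mirroring the
Callsite unit test; the compiler emits the equivalent of this during lowering.
`foo`, `bar` and `MyRoot` are hypothetical names, the include path is assumed,
and for brevity only TLS slot 0 (the non-scratch case) is used.

#include "CtxInstrProfiling.h"

using namespace __ctx_profile;

// In the real flow, LLVM emits this as a zero-initialized global per entrypoint.
ContextRoot MyRoot;

void bar();

// An "entry point": tries to take ownership of the root context.
void foo() {
  ContextNode *Ctx = __llvm_ctx_profile_start_context(&MyRoot, /*Guid=*/1,
                                                      /*Counters=*/2,
                                                      /*Callsites=*/1);
  if (!isScratch(Ctx)) {
    // Caller side of callsite #0: record the expected callee and where the
    // callee's context should be linked. Slot 0 is for non-scratch callers;
    // the real lowering switches to slot 1 when Ctx is scratch.
    __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(&bar);
    __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[0];
  }
  bar();
  __llvm_ctx_profile_release_context(&MyRoot);
}

// Callee side: claims a subcontext (or gets a scratch one) for itself. The
// runtime already bumps the entry counter via onEntry().
void bar() {
  ContextNode *Ctx = __llvm_ctx_profile_get_context(
      reinterpret_cast<void *>(&bar), /*Guid=*/2, /*NrCounters=*/1,
      /*NrCallsites=*/0);
  (void)Ctx;
}

In the real lowering, LLVM also increments entries of Ctx->counters() directly
for the remaining instrumented counters; that part is omitted here.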

>From 50449f5f4b1d424c3cda1824a449b85497713461 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Tue, 30 Apr 2024 16:44:52 -0700
Subject: [PATCH 2/5] more comments

---
 .../lib/ctx_profile/CtxInstrProfiling.cpp     | 57 ++++++++++++++++++-
 .../lib/ctx_profile/CtxInstrProfiling.h       | 48 ++++++++++++++--
 2 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 1db74eec35a1eb..28ab157d236157 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -20,30 +20,44 @@
 using namespace __ctx_profile;
 
 namespace {
+// Keep track of all the context roots we actually saw, so we can then traverse
+// them when the user asks for the profile in __llvm_ctx_profile_fetch
 __sanitizer::SpinMutex AllContextsMutex;
 SANITIZER_GUARDED_BY(AllContextsMutex)
 __sanitizer::Vector<ContextRoot *> AllContextRoots;
 
+// utility to taint a pointer by setting the LSB. There is an assumption
+// throughout that the addresses of contexts are even (really, they should be
+// align(8), but "even"-ness is the minimum assumption)
 ContextNode *markAsScratch(const ContextNode *Ctx) {
   return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
 }
 
-template <typename T> T consume(T &V) {
+// Used when getting the data from TLS. We don't *really* need to reset, but
+// it's a simpler system if we do.
+template <typename T> inline T consume(T &V) {
   auto R = V;
   V = {0};
   return R;
 }
 
+// Arena pages are allocated with at least kBuffSize bytes. The scratch buffer
+// is also that large.
 constexpr size_t kPower = 20;
 constexpr size_t kBuffSize = 1 << kPower;
 
+// Highly unlikely we need more than kBuffSize for a context.
 size_t getArenaAllocSize(size_t Needed) {
   if (Needed >= kBuffSize)
     return 2 * Needed;
   return kBuffSize;
 }
 
+// verify the structural integrity of the context
 bool validate(const ContextRoot *Root) {
+  // all contexts should be laid out in some arena page. Go over each arena
+  // allocated for this Root, and jump over contained contexts based on
+  // self-reported sizes.
   __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
   for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
     const auto *Pos = Mem->start();
@@ -56,6 +70,8 @@ bool validate(const ContextRoot *Root) {
     }
   }
 
+  // Now traverse the contexts again the same way, but validate that all
+  // non-null subcontext addresses appear in the set computed above.
   for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
     const auto *Pos = Mem->start();
     while (Pos < Mem->pos()) {
@@ -72,10 +88,19 @@ bool validate(const ContextRoot *Root) {
 }
 } // namespace
 
+// the scratch buffer - what we give when we can't produce a real context (the
+// scratch isn't "real" in that it's expected to be clobbered carelessly - we
+// don't read it). The other important thing is that the callees from a scratch
+// context also get a scratch context.
+// Eventually this can be replaced with per-function buffers, akin to the typical
+// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
+// the part about determining the nature of the subcontexts does.
 __thread char __Buffer[kBuffSize] = {0};
 
 #define TheScratchContext                                                      \
   markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
+
+// init the TLSes
 __thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                  nullptr};
 __thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
@@ -108,6 +133,7 @@ inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
                                        uint32_t NrCounters,
                                        uint32_t NrCallsites,
                                        ContextNode *Next) {
+  assert(reinterpret_cast<uint64_t>(Place) % sizeof(void*) == 0);
   return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
 }
 
@@ -119,12 +145,19 @@ void ContextNode::reset() {
       Next->reset();
 }
 
+// If this is the first time we hit a callsite with this particular callee
+// (identified by Guid), we need to allocate.
 ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
                              uint32_t NrCounters, uint32_t NrCallsites) {
   auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
   auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
   char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
   if (!AllocPlace) {
+    // if we failed to allocate on the current arena, allocate a new arena,
+    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
+    // find it from now on for other cases when we need to getCallsiteSlow.
+    // Note that allocateNewArena will link the allocated memory in the list of
+    // Arenas.
     __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
         Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
   }
@@ -137,13 +170,29 @@ ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
 ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                             uint32_t NrCounters,
                                             uint32_t NrCallsites) {
-  if (!__llvm_ctx_profile_current_context_root) {
+  // fast "out" if we're not even doing contextual collection.
+  if (!__llvm_ctx_profile_current_context_root)
     return TheScratchContext;
-  }
+
+  // also fast "out" if the caller is scratch.
   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
   if (!CallsiteContext || isScratch(*CallsiteContext))
     return TheScratchContext;
 
+  // if the callee isn't the expected one, return scratch.
+  // Signal handler(s) could have been invoked at any point in the execution.
+  // Should that have happened, and had it (the handler) been built with
+  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
+  // Its sub call graph would have then populated
+  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
+  // The normal call graph may be impacted in that, if the signal handler
+  // happened somewhere before we read the TLS here, we'd see the TLS reset and
+  // we'd also fail here. That would just mean we would lose counter values for
+  // the normal subgraph, this time around. That should be very unlikely, but if
+  // it happens too frequently, we should be able to detect discrepancies in
+  // entry counts (caller-callee). At the moment, though, the design goes on
+  // the assumption that this is so infrequent that it's not worth doing more
+  // for that case.
   auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
   if (ExpectedCallee != Callee)
     return TheScratchContext;
@@ -165,6 +214,8 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
   return Ret;
 }
 
+// This should be called once for a Root. Allocate the first arena, set up the
+// first context.
 void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                   uint32_t NrCallsites) {
   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index 9a1a74ca4cb8d6..4afd78ac41b8b4 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -53,6 +53,28 @@ class Arena final {
   const uint64_t Size;
 };
 
+/// The contextual profile is a directed tree where each node has one parent. A
+/// node (ContextNode) corresponds to a function activation. The root of the
+/// tree is at a function that was marked as entrypoint to the compiler. A node
+/// stores counter values for edges and a vector of subcontexts. These are the
+/// contexts of callees. The index in the subcontext vector corresponds to the
+/// index of the callsite (as was instrumented via llvm.instrprof.callsite). At
+/// that index we find a linked list, potentially empty, of ContextNodes. Direct
+/// calls will have 0 or 1 values in the linked list, but indirect callsites may
+/// have more.
+///
+/// The ContextNode has a fixed sized header describing it - the GUID of the
+/// function, the size of the counter and callsite vectors. It is also an
+/// (intrusive) linked list for the purposes of the indirect call case above.
+///
+/// Allocation is expected to happen on an Arena.
+///
+/// The structure of the ContextNode is known to LLVM, because LLVM needs to:
+///   (1) increment countes, and
+///   (2) form a GEP for the position in the subcontext list of a callsite
+/// This means changes to LLVM contextual profile lowering and changes here
+/// must be coupled.
+/// Note: the header content isn't interesting to LLVM (other than its size)
 class ContextNode final {
   const GUID Guid;
   ContextNode *const Next;
@@ -100,13 +122,16 @@ class ContextNode final {
 
   void reset();
 
+  // Since we go through the runtime to get a context back to LLVM, in the entry
+  // basic block, we might as well handle incrementing the entry basic block
+  // counter here.
   void onEntry() { ++counters()[0]; }
 
   uint64_t entrycount() const { return counters()[0]; }
 };
 
-/// ContextRoots are allocated by LLVM for entrypoints. The main concern is
-/// the total size, LLVM doesn't actually dereference members.
+/// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
+/// with allocating and zero-initializing the global value for it.
 struct ContextRoot {
   ContextNode *FirstNode = nullptr;
   Arena *FirstMemBlock = nullptr;
@@ -118,7 +143,8 @@ struct ContextRoot {
   static_assert(sizeof(Taken) == 1);
 };
 
-/// This API is exposed for testing.
+/// This API is exposed for testing. See the APIs below for the contract with
+/// LLVM.
 inline bool isScratch(const ContextNode *Ctx) {
   return (reinterpret_cast<uint64_t>(Ctx) & 1);
 }
@@ -128,13 +154,20 @@ inline bool isScratch(const ContextNode *Ctx) {
 extern "C" {
 
 // LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
-// position 0 is used when the current context isn't scratch, 1 when it is.
+// position 0 is used when the current context isn't scratch, 1 when it is. They
+// are volatile because of signal handlers - we mean to specifically control
+// when the data is loaded.
+//
+/// TLS where LLVM stores the pointer of the called value, as part of lowering a
+/// llvm.instrprof.callsite
 extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
+/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
+/// corresponds to the callsite being lowered.
 extern __thread __ctx_profile::ContextNode *
     *volatile __llvm_ctx_profile_callsite[2];
 
 // __llvm_ctx_profile_current_context_root is exposed for unit testing,
-// otherwise it's only used internally.
+// otherwise it's only used internally by compiler-rt/ctx_profile.
 extern __thread __ctx_profile::ContextRoot
     *volatile __llvm_ctx_profile_current_context_root;
 
@@ -156,7 +189,7 @@ __llvm_ctx_profile_get_context(void *Callee, __ctx_profile::GUID Guid,
                                uint32_t NrCounters, uint32_t NrCallsites);
 
 /// Prepares for collection. Currently this resets counter values but preserves
-/// internal structure.
+/// internal context tree structure.
 void __llvm_ctx_profile_start_collection();
 
 /// Completely free allocated memory.
@@ -165,6 +198,9 @@ void __llvm_ctx_profile_free();
 /// Used to obtain the profile. The Writer is called for each root ContextNode,
 /// with the ContextRoot::Taken taken. The Writer is responsible for traversing
 /// the structure underneath.
+/// The Writer's first parameter is an object it knows about, which is what the
+/// caller of __llvm_ctx_profile_fetch passes as the Data parameter. The second
+/// parameter is the root of a context tree.
 bool __llvm_ctx_profile_fetch(
     void *Data, bool (*Writer)(void *, const __ctx_profile::ContextNode &));
 }
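
To make the layout comment added above concrete, here is a small sketch (not
part of the patch; it assumes the usual 64-bit layout with no extra padding)
that prints the offsets for the 10-counter, 4-callsite shape used by the tests.

#include "CtxInstrProfiling.h"
#include <cstdio>

using namespace __ctx_profile;

int main() {
  constexpr uint32_t NrCounters = 10, NrCallsites = 4;
  // [[statically-declared fields][counters vector][subcontexts vector]]
  std::printf("header size       : %zu\n", sizeof(ContextNode));
  std::printf("counters() offset : %zu\n", sizeof(ContextNode));
  std::printf("subContexts() off : %zu\n",
              sizeof(ContextNode) + NrCounters * sizeof(uint64_t));
  std::printf("getAllocSize      : %zu\n",
              ContextNode::getAllocSize(NrCounters, NrCallsites));
  // With the common 64-bit layout this prints 24 / 24 / 104 / 136, matching
  // ContextTest.Basic's expectation for Ctx->size().
  return 0;
}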

>From de7676cd3513bd6ad983fc0e62d4ee7623d9f016 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Tue, 30 Apr 2024 16:49:34 -0700
Subject: [PATCH 3/5] formatting

---
 compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 28ab157d236157..b0f49d7854e190 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -133,7 +133,7 @@ inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
                                        uint32_t NrCounters,
                                        uint32_t NrCallsites,
                                        ContextNode *Next) {
-  assert(reinterpret_cast<uint64_t>(Place) % sizeof(void*) == 0);
+  assert(reinterpret_cast<uint64_t>(Place) % sizeof(void *) == 0);
   return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
 }
 

>From 48afbe38bc5446e405324cddd8109978baab3f55 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 6 May 2024 09:57:58 -0700
Subject: [PATCH 4/5] more comments / addressed feedback

---
 .../lib/ctx_profile/CtxInstrProfiling.cpp     |  7 +++
 .../lib/ctx_profile/CtxInstrProfiling.h       | 46 ++++++++++++++++---
 .../tests/CtxInstrProfilingTest.cpp           | 25 ++++++++++
 3 files changed, 72 insertions(+), 6 deletions(-)

diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index b0f49d7854e190..5df8a5870784bd 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -29,6 +29,9 @@ __sanitizer::Vector<ContextRoot *> AllContextRoots;
 // utility to taint a pointer by setting the LSB. There is an assumption
 // throughout that the addresses of contexts are even (really, they should be
 // align(8), but "even"-ness is the minimum assumption)
+// "scratch contexts" are buffers that we return in certain cases - they are
+// large enough to allow for memory safe counter access, but they don't link
+// subcontexts below them (the runtime recognizes them and enforces that)
 ContextNode *markAsScratch(const ContextNode *Ctx) {
   return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
 }
@@ -160,6 +163,7 @@ ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
     // Arenas.
     __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
         Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+    AllocPlace = Mem->tryBumpAllocate(AllocSize);
   }
   auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
                                  *InsertionPoint);
@@ -198,6 +202,8 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
     return TheScratchContext;
 
   auto *Callsite = *CallsiteContext;
+  // in the case of indirect calls, we will have all the seen targets forming a
+  // linked list here. Find the one corresponding to this callee.
   while (Callsite && Callsite->guid() != Guid) {
     Callsite = Callsite->next();
   }
@@ -243,6 +249,7 @@ ContextNode *__llvm_ctx_profile_start_context(
     Root->FirstNode->onEntry();
     return Root->FirstNode;
   }
+  // If this thread couldn't take the lock, return scratch context.
   __llvm_ctx_profile_current_context_root = nullptr;
   return TheScratchContext;
 }
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index 4afd78ac41b8b4..f97d31a29a2101 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -67,14 +67,28 @@ class Arena final {
 /// function, the size of the counter and callsite vectors. It is also an
 /// (intrusive) linked list for the purposes of the indirect call case above.
 ///
-/// Allocation is expected to happen on an Arena.
+/// Allocation is expected to happen on an Arena. The allocation lays out inline
+/// the counter and subcontexts vectors. The class offers APIs to correctly
+/// reference the latter.
+///
+/// The layout is as follows:
+///
+/// [[statically-declared fields][counters vector][subcontexts vector]]
+///
+/// See also documentation on the counters and subContexts members below.
 ///
 /// The structure of the ContextNode is known to LLVM, because LLVM needs to:
-///   (1) increment countes, and
+///   (1) increment counts, and
 ///   (2) form a GEP for the position in the subcontext list of a callsite
 /// This means changes to LLVM contextual profile lowering and changes here
 /// must be coupled.
 /// Note: the header content isn't interesting to LLVM (other than its size)
+///
+/// Part of contextual collection is the notion of "scratch contexts". These are
+/// buffers that are "large enough" to allow for memory-safe accesses during
+/// counter increments - meaning the counter increment code in LLVM doesn't need
+/// to be concerned with memory safety. Their subcontexts never get populated,
+/// though. The runtime code here produces and recognizes them.
 class ContextNode final {
   const GUID Guid;
   ContextNode *const Next;
@@ -95,6 +109,7 @@ class ContextNode final {
            sizeof(ContextNode *) * NrCallsites;
   }
 
+  // The counters vector starts right after the static header.
   uint64_t *counters() {
     ContextNode *addr_after = &(this[1]);
     return reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(addr_after));
@@ -107,6 +122,7 @@ class ContextNode final {
     return const_cast<ContextNode *>(this)->counters();
   }
 
+  // The subcontexts vector starts right after the end of the counters vector.
   ContextNode **subContexts() {
     return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
   }
@@ -131,12 +147,30 @@ class ContextNode final {
 };
 
 /// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
-/// with allocating and zero-initializing the global value for it.
+/// with allocating and zero-initializing the global value (as in, GlobalValue)
+/// for it.
 struct ContextRoot {
   ContextNode *FirstNode = nullptr;
   Arena *FirstMemBlock = nullptr;
   Arena *CurrentMem = nullptr;
   // This is init-ed by the static zero initializer in LLVM.
+  // Taken is used to ensure only one thread traverses the contextual graph -
+  // either to read it or to write it. On the server side, the same entrypoint
+  // will be entered by numerous threads, but over time, the profile aggregated
+  // collecting sequentially on one thread at a time is expected to converge to
+  // the aggregate profile that may have been observable on all the threads.
+  // Note that this is node-by-node aggregation, i.e. summing counters of nodes
+  // at the same position in the graph, not flattening.
+  // Threads that cannot lock Taken (fail TryLock) are given a "scratch context"
+  // - a buffer they can clobber, safely from a memory access perspective.
+  // We could consider relaxing the single-collector requirement - allowing more
+  // than one thread to collect - by holding a few context trees per entrypoint
+  // and aggregating them (as explained above) at the end of profile collection.
+  // It's a tradeoff between collection time and memory use: higher precision
+  // can be obtained with either fewer concurrent collections but more collection
+  // time, or with more concurrent collections (==more memory) and less time.
+  // Note that concurrent collection does happen for different entrypoints,
+  // regardless.
   ::__sanitizer::StaticSpinMutex Taken;
 
   // Avoid surprises due to (unlikely) StaticSpinMutex changes.
@@ -198,9 +232,9 @@ void __llvm_ctx_profile_free();
 /// Used to obtain the profile. The Writer is called for each root ContextNode,
 /// with the ContextRoot::Taken taken. The Writer is responsible for traversing
 /// the structure underneath.
-/// The Writer's first parameter is an object it knows about, which is what the
-/// caller of __llvm_ctx_profile_fetch passes as the Data parameter. The second
-/// parameter is the root of a context tree.
+/// The Writer's first parameter plays the role of closure for Writer, and is
+/// what the caller of __llvm_ctx_profile_fetch passes as the Data parameter.
+/// The second parameter is the root of a context tree.
 bool __llvm_ctx_profile_fetch(
     void *Data, bool (*Writer)(void *, const __ctx_profile::ContextNode &));
 }
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index e22edeb6192311..ada9c98b5b0f68 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -53,12 +53,17 @@ TEST_F(ContextTest, Callsite) {
   int OpaqueValue = 0;
   const bool IsScratch = isScratch(Ctx);
   EXPECT_FALSE(IsScratch);
+  // This is the sequence the caller performs - it's the lowering of the
+  // instrumentation of the callsite "2". "2" is arbitrary here.
   __llvm_ctx_profile_expected_callee[0] = &OpaqueValue;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  // This is what the callee does
   auto *Subctx = __llvm_ctx_profile_get_context(&OpaqueValue, 2, 3, 1);
+  // We expect the subcontext to be appropriately placed and dimensioned
   EXPECT_EQ(Ctx->subContexts()[2], Subctx);
   EXPECT_EQ(Subctx->counters_size(), 3U);
   EXPECT_EQ(Subctx->callsites_size(), 1U);
+  // We reset these in _get_context.
   EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
   EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
 
@@ -73,6 +78,8 @@ TEST_F(ContextTest, ScratchNoCollection) {
   // this would be the very first function executing this. the TLS is empty,
   // too.
   auto *Ctx = __llvm_ctx_profile_get_context(&OpaqueValue, 2, 3, 1);
+  // We never entered a context (_start_context was never called) - so the
+  // returned context must be scratch.
   EXPECT_TRUE(isScratch(Ctx));
 }
 
@@ -83,6 +90,9 @@ TEST_F(ContextTest, ScratchDuringCollection) {
   __llvm_ctx_profile_expected_callee[0] = &OpaqueValue;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
   auto *Subctx = __llvm_ctx_profile_get_context(&OtherOpaqueValue, 2, 3, 1);
+  // We expected a different callee - so return scratch. It mimics what happens
+  // in the case of a signal handler - in this case, OtherOpaqueValue is the
+  // signal handler.
   EXPECT_TRUE(isScratch(Subctx));
   EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
   EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
@@ -97,6 +107,21 @@ TEST_F(ContextTest, ScratchDuringCollection) {
   __llvm_ctx_profile_release_context(&Root);
 }
 
+TEST_F(ContextTest, NeedMoreMemory) {
+  auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
+  int OpaqueValue = 0;
+  const bool IsScratch = isScratch(Ctx);
+  EXPECT_FALSE(IsScratch);
+  const auto *CurrentMem = Root.CurrentMem;
+  __llvm_ctx_profile_expected_callee[0] = &OpaqueValue;
+  __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  // Allocate a massive subcontext to force new arena allocation
+  auto *Subctx = __llvm_ctx_profile_get_context(&OpaqueValue, 3, 1 << 20, 1);
+  EXPECT_EQ(Ctx->subContexts()[2], Subctx);
+  EXPECT_NE(CurrentMem, Root.CurrentMem);
+  EXPECT_NE(Root.CurrentMem, nullptr);
+}
+
 TEST_F(ContextTest, ConcurrentRootCollection) {
   std::atomic<int> NonScratch = 0;
   std::atomic<int> Executions = 0;
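
Below is a minimal sketch (not part of the patch) of a consumer using the
__llvm_ctx_profile_fetch contract documented above: the Data pointer plays the
role of a closure for the Writer, and the Writer is responsible for walking each
root's tree through subContexts(). printTree is a hypothetical helper.

#include "CtxInstrProfiling.h"
#include <cstdio>

using namespace __ctx_profile;

static void printTree(const ContextNode &Node, int Depth) {
  std::printf("%*sguid=%llu entries=%llu\n", Depth * 2, "",
              (unsigned long long)Node.guid(),
              (unsigned long long)Node.entrycount());
  // Each callsite holds a linked list of seen callees (more than one entry
  // only for indirect calls).
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (ContextNode *Sub = Node.subContexts()[I]; Sub; Sub = Sub->next())
      printTree(*Sub, Depth + 1);
}

int main() {
  unsigned Roots = 0; // plays the role of the "Data" closure
  __llvm_ctx_profile_fetch(&Roots,
                           [](void *Data, const ContextNode &Root) -> bool {
                             ++*static_cast<unsigned *>(Data);
                             printTree(Root, 0);
                             return true; // returning false aborts the fetch
                           });
  std::printf("roots visited: %u\n", Roots);
  return 0;
}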

>From 658faccb0422bd741b418564e0232b69794d1787 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 6 May 2024 16:34:33 -0700
Subject: [PATCH 5/5] more comments, asserts about alignment

---
 .../lib/ctx_profile/CtxInstrProfiling.cpp       |  7 ++++---
 compiler-rt/lib/ctx_profile/CtxInstrProfiling.h | 17 +++++++++++++++--
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 5df8a5870784bd..ff862fb40e404c 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -115,8 +115,9 @@ __thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
 // the dependency on the latter.
 Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
   assert(!Prev || Prev->Next == nullptr);
-  Arena *NewArena =
-      new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
+  Arena *NewArena = new (__sanitizer::InternalAlloc(
+      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
+      Arena(Size);
   if (Prev)
     Prev->Next = NewArena;
   return NewArena;
@@ -136,7 +137,7 @@ inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
                                        uint32_t NrCounters,
                                        uint32_t NrCallsites,
                                        ContextNode *Next) {
-  assert(reinterpret_cast<uint64_t>(Place) % sizeof(void *) == 0);
+  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
   return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
 }
 
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index f97d31a29a2101..5280ff358ec6f2 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -14,6 +14,11 @@
 
 namespace __ctx_profile {
 using GUID = uint64_t;
+static constexpr size_t ExpectedAlignment = 8;
+// We really depend on this, see further below. We currently support x86_64.
+// When we want to support other archs, we need to trace the places where
+// ExpectedAlignment is used and adjust accordingly.
+static_assert(sizeof(void *) == ExpectedAlignment);
 
 /// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
 /// Allocation and de-allocation happen using sanitizer APIs. We make that
@@ -53,6 +58,10 @@ class Arena final {
   const uint64_t Size;
 };
 
+// The memory available for allocation follows the Arena header, and we expect
+// it to be thus aligned.
+static_assert(sizeof(Arena) % ExpectedAlignment == 0);
+
 /// The contextual profile is a directed tree where each node has one parent. A
 /// node (ContextNode) corresponds to a function activation. The root of the
 /// tree is at a function that was marked as entrypoint to the compiler. A node
@@ -73,7 +82,7 @@ class Arena final {
 ///
 /// The layout is as follows:
 ///
-/// [[statically-declared fields][counters vector][subcontexts vector]]
+/// [[declared fields][counters vector][vector of ptrs to subcontexts]]
 ///
 /// See also documentation on the counters and subContexts members below.
 ///
@@ -112,7 +121,7 @@ class ContextNode final {
   // The counters vector starts right after the static header.
   uint64_t *counters() {
     ContextNode *addr_after = &(this[1]);
-    return reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(addr_after));
+    return reinterpret_cast<uint64_t *>(addr_after);
   }
 
   uint32_t counters_size() const { return NrCounters; }
@@ -146,6 +155,10 @@ class ContextNode final {
   uint64_t entrycount() const { return counters()[0]; }
 };
 
+// Verify that maintenance to ContextNode doesn't break this invariant, which
+// makes sure the inlined vectors are appropriately aligned.
+static_assert(sizeof(ContextNode) % ExpectedAlignment == 0);
+
 /// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
 /// with allocating and zero-initializing the global value (as in, GlobalValue)
 /// for it.
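
For completeness, a quick sanity sketch (mirroring the new static_asserts, not
part of the patch) of why bump allocation inside an Arena keeps every
ContextNode at ExpectedAlignment: the Arena header, the ContextNode header, and
each inline element size are all multiples of 8 on the supported 64-bit target,
so an aligned bump pointer never drifts. The `sketch` namespace is only for
illustration.

#include "CtxInstrProfiling.h"
#include <cstdint>

namespace sketch {
using namespace __ctx_profile;

// Already asserted by the patch:
static_assert(sizeof(Arena) % ExpectedAlignment == 0);
static_assert(sizeof(ContextNode) % ExpectedAlignment == 0);
// Each inline element is 8 bytes on the supported (64-bit) target:
static_assert(sizeof(uint64_t) % ExpectedAlignment == 0);      // counters
static_assert(sizeof(ContextNode *) % ExpectedAlignment == 0); // subcontext ptrs
// Hence ContextNode::getAllocSize(N, M) is a multiple of ExpectedAlignment, and
// a bump pointer that starts right after the Arena header stays aligned.
} // namespace sketch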


