[compiler-rt] [llvm] [ctxprof] Flat profile collection (PR #130655)
Mircea Trofin via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 15:51:36 PDT 2025
https://github.com/mtrofin updated https://github.com/llvm/llvm-project/pull/130655
>From aacc2ad87cb3bae363977ec76786558215518994 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Thu, 6 Mar 2025 16:33:55 -0800
Subject: [PATCH] [ctxprof] Flat profile collection
---
.../lib/ctx_profile/CtxInstrContextNode.h | 6 +
.../lib/ctx_profile/CtxInstrProfiling.cpp | 136 ++++++++++++++++--
.../lib/ctx_profile/CtxInstrProfiling.h | 25 +++-
.../tests/CtxInstrProfilingTest.cpp | 87 +++++++++--
.../TestCases/generate-context.cpp | 37 +++++
.../llvm/ProfileData/CtxInstrContextNode.h | 6 +
.../llvm/ProfileData/PGOCtxProfWriter.h | 6 +-
.../Instrumentation/PGOCtxProfLowering.cpp | 23 ++-
.../PGOProfile/ctx-instrumentation.ll | 15 +-
9 files changed, 303 insertions(+), 38 deletions(-)
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
index fe8ddcdf79129..0fc4883305145 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -122,6 +122,12 @@ class ProfileWriter {
virtual void startContextSection() = 0;
virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
virtual void endContextSection() = 0;
+
+ virtual void startFlatSection() = 0;
+ virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
+ size_t BufferSize) = 0;
+ virtual void endFlatSection() = 0;
+
virtual ~ProfileWriter() = default;
};
} // namespace ctx_profile
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 992aa94a6631d..d7ec8fde4ec7d 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -8,6 +8,8 @@
#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_atomic_clang.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
@@ -27,6 +29,20 @@ __sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;
+__sanitizer::atomic_uintptr_t AllFunctionsData = {};
+
+// Keep all the functions for which we collect a flat profile in a linked list.
+__sanitizer::SpinMutex FlatCtxArenaMutex;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArenaHead = nullptr;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArena = nullptr;
+
+// Set to true when we enter a root, and false when we exit - regardless of
+// whether this thread collects a contextual profile for that root.
+__thread bool IsUnderContext = false;
+__sanitizer::atomic_uint8_t ProfilingStarted = {};
+
// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
@@ -109,7 +125,10 @@ void resetContextNode(ContextNode &Node) {
resetContextNode(*Next);
}
-void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
+ContextNode *onContextEnter(ContextNode &Node) {
+ ++Node.counters()[0];
+ return &Node;
+}
} // namespace
@@ -182,12 +201,75 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
return Ret;
}
-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
- uint32_t NumCounters,
+ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
+ uint32_t NumCounters) {
+ if (ContextNode *Existing = Data.FlatCtx)
+ return Existing;
+ {
+ // We could instead try to take the lock and, if that fails, return
+ // TheScratchContext. But that could leave message pump loops more sparsely
+ // profiled than everything else. Maybe that doesn't matter, and we can
+ // optimize this later.
+ __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
+ if (ContextNode *Existing = Data.FlatCtx)
+ return Existing;
+
+ auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
+ char *AllocBuff = nullptr;
+ {
+ __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
+ &FlatCtxArenaMutex);
+ if (FlatCtxArena)
+ AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+ if (!AllocBuff) {
+ FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
+ FlatCtxArena);
+ AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+ }
+ if (!FlatCtxArenaHead)
+ FlatCtxArenaHead = FlatCtxArena;
+ }
+ auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
+ Data.FlatCtx = Ret;
+
+ Data.Next = reinterpret_cast<FunctionData *>(
+ __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+ while (!__sanitizer::atomic_compare_exchange_strong(
+ &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
+ reinterpret_cast<uintptr_t>(&Data),
+ __sanitizer::memory_order_release)) {
+ }
+ }
+
+ return Data.FlatCtx;
+}
+
+ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
+ uint32_t NumCounters) {
+ // 1) if we are under a root (regardless of whether this thread is collecting
+ // a contextual profile for that root), do not collect a flat profile. We want
+ // to keep flat profiles only for activations that can't happen under a root,
+ // to avoid confusing profiles. We can, for example, combine flattened and
+ // flat profiles meaningfully, as we wouldn't double-count anything.
+ //
+ // 2) to avoid lengthy startup, don't bother with flat profiles until
+ // profiling has started. We would reset them anyway when profiling starts.
+ // HOWEVER. This does lose profiling for message pumps: those functions are
+ // entered once and never exit. They should be assumed to be entered before
+ // profiling starts - because profiling should start after the server is up
+ // and running (which is equivalent to "message pumps are set up").
+ if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
+ return TheScratchContext;
+ return markAsScratch(
+ onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+}
+
+ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
+ GUID Guid, uint32_t NumCounters,
uint32_t NumCallsites) {
// fast "out" if we're not even doing contextual collection.
if (!__llvm_ctx_profile_current_context_root)
- return TheScratchContext;
+ return getUnhandledContext(*Data, Guid, NumCounters);
// also fast "out" if the caller is scratch. We can see if it's scratch by
// looking at the interior pointer into the subcontexts vector that the caller
@@ -196,7 +278,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
// precisely, aligned - 8 values)
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
if (!CallsiteContext || isScratch(CallsiteContext))
- return TheScratchContext;
+ return getUnhandledContext(*Data, Guid, NumCounters);
// if the callee isn't the expected one, return scratch.
// Signal handler(s) could have been invoked at any point in the execution.
@@ -214,7 +296,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
// for that case.
auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
if (ExpectedCallee != Callee)
- return TheScratchContext;
+ return getUnhandledContext(*Data, Guid, NumCounters);
auto *Callsite = *CallsiteContext;
// in the case of indirect calls, we will have all seen targets forming a
@@ -257,6 +339,7 @@ void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
ContextNode *__llvm_ctx_profile_start_context(
ContextRoot *Root, GUID Guid, uint32_t Counters,
uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+ IsUnderContext = true;
if (!Root->FirstMemBlock) {
setupContext(Root, Guid, Counters, Callsites);
}
@@ -272,6 +355,7 @@ ContextNode *__llvm_ctx_profile_start_context(
void __llvm_ctx_profile_release_context(ContextRoot *Root)
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+ IsUnderContext = false;
if (__llvm_ctx_profile_current_context_root) {
__llvm_ctx_profile_current_context_root = nullptr;
Root->Taken.Unlock();
@@ -291,10 +375,12 @@ void __llvm_ctx_profile_start_collection() {
resetContextNode(*Root->FirstNode);
}
+ __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
}
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
+ __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
&AllContextsMutex);
@@ -310,17 +396,43 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
Writer.writeContextual(*Root->FirstNode);
}
Writer.endContextSection();
+ Writer.startFlatSection();
+ // The list progresses behind the head, so taking this snapshot allows the
+ // list to grow concurrently without causing a race condition with our
+ // traversing it.
+ const auto *Pos = reinterpret_cast<const FunctionData *>(
+ __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+ for (; Pos; Pos = Pos->Next)
+ Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
+ Pos->FlatCtx->counters_size());
+ Writer.endFlatSection();
return true;
}
void __llvm_ctx_profile_free() {
- __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
- &AllContextsMutex);
- for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
- for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+ __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
+ {
+ __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+ &AllContextsMutex);
+ for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+ for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+ auto *C = A;
+ A = A->next();
+ __sanitizer::InternalFree(C);
+ }
+ AllContextRoots.Reset();
+ }
+ __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
+ {
+ __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+ &FlatCtxArenaMutex);
+ FlatCtxArena = nullptr;
+ for (auto *A = FlatCtxArenaHead; A;) {
auto *C = A;
- A = A->next();
+ A = C->next();
__sanitizer::InternalFree(C);
}
- AllContextRoots.Reset();
+
+ FlatCtxArenaHead = nullptr;
+ }
}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index 8a6949d4ec288..c41a77457178c 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -113,6 +113,28 @@ struct ContextRoot {
static_assert(sizeof(Taken) == 1);
};
+// This is allocated and zero-initialized by the compiler; the in-place
+// initialization serves mostly as self-documentation and for testing.
+// The design is influenced by the observation that typically (at least for
+// datacenter binaries, which is the motivating target of this profiler) less
+// than 10% of functions in a binary even appear in a profile (of any kind).
+//
+// 1) We could pre-allocate the flat profile storage in the compiler, just like
+// the flat instrumented profiling does. But that penalizes the static size of
+// the binary for little reason
+//
+// 2) We could do the above but zero-initialize the buffers (which should place
+// them in .bss), and dynamically populate them. This, though, would page-in
+// more memory upfront for the binary's runtime
+//
+// The current design trades off a bit of overhead at the first time a function
+// is encountered *for flat profiling* for avoiding size penalties.
+struct FunctionData {
+ FunctionData *Next = nullptr;
+ ContextNode *volatile FlatCtx = nullptr;
+ ::__sanitizer::StaticSpinMutex Mutex;
+};
+
/// This API is exposed for testing. See the APIs below about the contract with
/// LLVM.
inline bool isScratch(const void *Ctx) {
@@ -152,7 +174,8 @@ void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
/// called for any other function than entry points, in the entry BB of such
/// function. Same consideration about LSB of returned value as .._start_context
-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *Data,
+ void *Callee, GUID Guid,
uint32_t NumCounters,
uint32_t NumCallsites);
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index 97292f9f1abff..01a8274774ecb 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -40,6 +40,7 @@ TEST(ArenaTest, Basic) {
}
TEST_F(ContextTest, Basic) {
+ __llvm_ctx_profile_start_collection();
auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
ASSERT_NE(Ctx, nullptr);
EXPECT_NE(Root.CurrentMem, nullptr);
@@ -58,6 +59,7 @@ TEST_F(ContextTest, Basic) {
}
TEST_F(ContextTest, Callsite) {
+ __llvm_ctx_profile_start_collection();
auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
int FakeCalleeAddress = 0;
const bool IsScratch = isScratch(Ctx);
@@ -67,7 +69,11 @@ TEST_F(ContextTest, Callsite) {
__llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
__llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
// This is what the callee does
- auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+ FunctionData FData = {0};
+ auto *Subctx =
+ __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+ // This should not have required creating a flat context.
+ EXPECT_EQ(FData.FlatCtx, nullptr);
// We expect the subcontext to be appropriately placed and dimensioned
EXPECT_EQ(Ctx->subContexts()[2], Subctx);
EXPECT_EQ(Subctx->counters_size(), 3U);
@@ -81,29 +87,59 @@ TEST_F(ContextTest, Callsite) {
__llvm_ctx_profile_release_context(&Root);
}
-TEST_F(ContextTest, ScratchNoCollection) {
+TEST_F(ContextTest, ScratchNoCollectionProfilingNotStarted) {
+ // This test intentionally does not call __llvm_ctx_profile_start_collection.
EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
int FakeCalleeAddress = 0;
// this would be the very first function executing this. the TLS is empty,
// too.
- auto *Ctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+ FunctionData FData = {0};
+ auto *Ctx =
+ __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+ // We never entered a context (_start_context was never called) - so the
+ // returned context must be a tagged pointer.
+ EXPECT_TRUE(isScratch(Ctx));
+ // Because we didn't start collection, no flat profile should have been
+ // allocated.
+ EXPECT_EQ(FData.FlatCtx, nullptr);
+}
+
+TEST_F(ContextTest, ScratchNoCollectionProfilingStarted) {
+ ASSERT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+ int FakeCalleeAddress = 0;
+ // Start collection, so the function gets a flat profile instead of scratch.
+ __llvm_ctx_profile_start_collection();
+ // this would be the very first function executing this. the TLS is empty,
+ // too.
+ FunctionData FData = {0};
+ auto *Ctx =
+ __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
// We never entered a context (_start_context was never called) - so the
- // returned context must be scratch.
+ // returned context must be a tagged pointer.
EXPECT_TRUE(isScratch(Ctx));
+ // Because we never entered a context, we should have allocated a flat context
+ EXPECT_NE(FData.FlatCtx, nullptr);
+ EXPECT_EQ(reinterpret_cast<uintptr_t>(FData.FlatCtx) + 1,
+ reinterpret_cast<uintptr_t>(Ctx));
}
TEST_F(ContextTest, ScratchDuringCollection) {
+ __llvm_ctx_profile_start_collection();
auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
int FakeCalleeAddress = 0;
int OtherFakeCalleeAddress = 0;
__llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
__llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
- auto *Subctx =
- __llvm_ctx_profile_get_context(&OtherFakeCalleeAddress, 2, 3, 1);
+ FunctionData FData[3] = {0};
+ auto *Subctx = __llvm_ctx_profile_get_context(
+ &FData[0], &OtherFakeCalleeAddress, 2, 3, 1);
// We expected a different callee - so return scratch. It mimics what happens
// in the case of a signal handler - in this case, OtherFakeCalleeAddress is
// the signal handler.
EXPECT_TRUE(isScratch(Subctx));
+ // We shouldn't have tried to return a flat context because we're under a
+ // root.
+ EXPECT_EQ(FData[0].FlatCtx, nullptr);
EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
@@ -111,24 +147,27 @@ TEST_F(ContextTest, ScratchDuringCollection) {
__llvm_ctx_profile_expected_callee[1] = &ThirdFakeCalleeAddress;
__llvm_ctx_profile_callsite[1] = &Subctx->subContexts()[0];
- auto *Subctx2 =
- __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+ auto *Subctx2 = __llvm_ctx_profile_get_context(
+ &FData[1], &ThirdFakeCalleeAddress, 3, 0, 0);
// We again expect scratch because the '0' position is where the runtime
// looks, so it doesn't matter the '1' position is populated correctly.
EXPECT_TRUE(isScratch(Subctx2));
+ EXPECT_EQ(FData[1].FlatCtx, nullptr);
__llvm_ctx_profile_expected_callee[0] = &ThirdFakeCalleeAddress;
__llvm_ctx_profile_callsite[0] = &Subctx->subContexts()[0];
- auto *Subctx3 =
- __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+ auto *Subctx3 = __llvm_ctx_profile_get_context(
+ &FData[2], &ThirdFakeCalleeAddress, 3, 0, 0);
// We expect scratch here, too, because the value placed in
// __llvm_ctx_profile_callsite is scratch
EXPECT_TRUE(isScratch(Subctx3));
+ EXPECT_EQ(FData[2].FlatCtx, nullptr);
__llvm_ctx_profile_release_context(&Root);
}
TEST_F(ContextTest, NeedMoreMemory) {
+ __llvm_ctx_profile_start_collection();
auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
int FakeCalleeAddress = 0;
const bool IsScratch = isScratch(Ctx);
@@ -136,9 +175,11 @@ TEST_F(ContextTest, NeedMoreMemory) {
const auto *CurrentMem = Root.CurrentMem;
__llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
__llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+ FunctionData FData = {0};
// Allocate a massive subcontext to force new arena allocation
auto *Subctx =
- __llvm_ctx_profile_get_context(&FakeCalleeAddress, 3, 1 << 20, 1);
+ __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 3, 1 << 20, 1);
+ EXPECT_EQ(FData.FlatCtx, nullptr);
EXPECT_EQ(Ctx->subContexts()[2], Subctx);
EXPECT_NE(CurrentMem, Root.CurrentMem);
EXPECT_NE(Root.CurrentMem, nullptr);
@@ -175,7 +216,9 @@ TEST_F(ContextTest, Dump) {
int FakeCalleeAddress = 0;
__llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
__llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
- auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+ FunctionData FData = {0};
+ auto *Subctx =
+ __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
(void)Subctx;
__llvm_ctx_profile_release_context(&Root);
@@ -186,6 +229,9 @@ TEST_F(ContextTest, Dump) {
int EnteredSectionCount = 0;
int ExitedSectionCount = 0;
+ int EnteredFlatCount = 0;
+ int ExitedFlatCount = 0;
+ int FlatsWritten = 0;
bool State = false;
@@ -217,6 +263,16 @@ TEST_F(ContextTest, Dump) {
EXPECT_EQ(EnteredSectionCount, 1);
++ExitedSectionCount;
}
+ void startFlatSection() override { ++EnteredFlatCount; }
+ void writeFlat(GUID Guid, const uint64_t *Buffer,
+ size_t BufferSize) override {
+ ++FlatsWritten;
+ EXPECT_EQ(BufferSize, 3);
+ EXPECT_EQ(Buffer[0], 15U);
+ EXPECT_EQ(Buffer[1], 0U);
+ EXPECT_EQ(Buffer[2], 0U);
+ }
+ void endFlatSection() override { ++ExitedFlatCount; }
};
TestProfileWriter W(&Root, 1);
@@ -226,10 +282,17 @@ TEST_F(ContextTest, Dump) {
// this resets all counters but not the internal structure.
__llvm_ctx_profile_start_collection();
+ auto *Flat =
+ __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+ EXPECT_NE(FData.FlatCtx, nullptr);
+ FData.FlatCtx->counters()[0] = 15U;
TestProfileWriter W2(&Root, 0);
EXPECT_FALSE(W2.State);
__llvm_ctx_profile_fetch(W2);
EXPECT_TRUE(W2.State);
EXPECT_EQ(W2.EnteredSectionCount, 1);
EXPECT_EQ(W2.ExitedSectionCount, 1);
+ EXPECT_EQ(W2.EnteredFlatCount, 1);
+ EXPECT_EQ(W2.FlatsWritten, 1);
+ EXPECT_EQ(W2.ExitedFlatCount, 1);
}
diff --git a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
index cdf819cbefc3b..bf33b4423fd1f 100644
--- a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
+++ b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
@@ -15,6 +15,7 @@
#include <iostream>
using namespace llvm::ctx_profile;
+extern "C" void __llvm_ctx_profile_start_collection();
extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
// avoid name mangling
@@ -36,6 +37,15 @@ __attribute__((noinline)) void theRoot() {
someFunction(I);
}
}
+
+__attribute__((noinline)) void flatFct() {
+ printf("flat check 1\n");
+ someFunction(1);
+#pragma nounroll
+ for (auto I = 0; I < 2; ++I) {
+ someFunction(I);
+ }
+}
}
// Make sure the program actually ran correctly.
@@ -43,6 +53,10 @@ __attribute__((noinline)) void theRoot() {
// CHECK-NEXT: check odd
// CHECK-NEXT: check even
// CHECK-NEXT: check odd
+// CHECK-NEXT: flat check 1
+// CHECK-NEXT: check odd
+// CHECK-NEXT: check even
+// CHECK-NEXT: check odd
class TestProfileWriter : public ProfileWriter {
void printProfile(const ContextNode &Node, const std::string &Indent,
@@ -73,6 +87,22 @@ class TestProfileWriter : public ProfileWriter {
void writeContextual(const ContextNode &RootNode) override {
printProfile(RootNode, "", "");
}
+
+ void startFlatSection() override {
+ std::cout << "Entered Flat Section" << std::endl;
+ }
+
+ void writeFlat(GUID Guid, const uint64_t *Buffer,
+ size_t BufferSize) override {
+ std::cout << "Flat: " << Guid << " " << Buffer[0];
+ for (size_t I = 1U; I < BufferSize; ++I)
+ std::cout << "," << Buffer[I];
+ std::cout << std::endl;
+ };
+
+ void endFlatSection() override {
+ std::cout << "Exited Flat Section" << std::endl;
+ }
};
// 8657661246551306189 is theRoot. We expect 2 callsites and 2 counters - one
@@ -100,6 +130,11 @@ class TestProfileWriter : public ProfileWriter {
// CHECK-NEXT: 2 counters and 2 callsites
// CHECK-NEXT: Counter values: 2 1
// CHECK-NEXT: Exited Context Section
+// CHECK-NEXT: Entered Flat Section
+// CHECK-NEXT: Flat: 6759619411192316602 3,1
+// This is flatFct (guid: 14569438697463215220)
+// CHECK-NEXT: Flat: 14569438697463215220 1,2
+// CHECK-NEXT: Exited Flat Section
bool profileWriter() {
TestProfileWriter W;
@@ -107,7 +142,9 @@ bool profileWriter() {
}
int main(int argc, char **argv) {
+ __llvm_ctx_profile_start_collection();
theRoot();
+ flatFct();
// This would be implemented in a specific RPC handler, but here we just call
// it directly.
return !profileWriter();
diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
index fe8ddcdf79129..0fc4883305145 100644
--- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
+++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
@@ -122,6 +122,12 @@ class ProfileWriter {
virtual void startContextSection() = 0;
virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
virtual void endContextSection() = 0;
+
+ virtual void startFlatSection() = 0;
+ virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
+ size_t BufferSize) = 0;
+ virtual void endFlatSection() = 0;
+
virtual ~ProfileWriter() = default;
};
} // namespace ctx_profile
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
index 40f355f99eb53..c5a724d9a2142 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
@@ -87,10 +87,10 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {
void writeContextual(const ctx_profile::ContextNode &RootNode) override;
void endContextSection() override;
- void startFlatSection();
+ void startFlatSection() override;
void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
- size_t BufferSize);
- void endFlatSection();
+ size_t BufferSize) override;
+ void endFlatSection() override;
// constants used in writing which a reader may find useful.
static constexpr unsigned CodeLen = 2;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index aa6bee23ad5ff..ffc2aec77ff91 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -12,6 +12,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -53,6 +54,7 @@ class CtxInstrumentationLowerer final {
ModuleAnalysisManager &MAM;
Type *ContextNodeTy = nullptr;
Type *ContextRootTy = nullptr;
+ Type *FunctionDataTy = nullptr;
DenseMap<const Function *, Constant *> ContextRootMap;
Function *StartCtx = nullptr;
@@ -120,6 +122,12 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
PointerTy, /*CurrentMem*/
SanitizerMutexType, /*Taken*/
});
+ FunctionDataTy =
+ StructType::get(M.getContext(), {
+ PointerTy, /*FlatCtx*/
+ SanitizerMutexType, /*Mutex*/
+ });
+
// The Context header.
ContextNodeTy = StructType::get(M.getContext(), {
I64Ty, /*Guid*/
@@ -163,7 +171,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
GetCtx = cast<Function>(
M.getOrInsertFunction(CompilerRtAPINames::GetCtx,
FunctionType::get(PointerTy,
- {PointerTy, /*Callee*/
+ {PointerTy, /*FunctionData*/
+ PointerTy, /*Callee*/
I64Ty, /*Guid*/
I32Ty, /*NumCounters*/
I32Ty}, /*NumCallsites*/
@@ -224,7 +233,6 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
assert(Mark->getIndex()->isZero());
IRBuilder<> Builder(Mark);
-
Guid = Builder.getInt64(
AssignGUIDPass::getGUID(cast<Function>(*Mark->getNameValue())));
// The type of the context of this function is now knowable since we have
@@ -248,9 +256,14 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
ORE.emit(
[&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
} else {
- Context =
- Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NumCounters),
- Builder.getInt32(NumCallsites)});
+ // Keep the global unnamed, as these names end up taking up a lot of space
+ // in the binary.
+ auto *FData = new GlobalVariable(
+ M, FunctionDataTy, false, GlobalVariable::InternalLinkage,
+ Constant::getNullValue(FunctionDataTy));
+ Context = Builder.CreateCall(GetCtx, {FData, &F, Guid,
+ Builder.getInt32(NumCounters),
+ Builder.getInt32(NumCallsites)});
ORE.emit([&] {
return OptimizationRemark(DEBUG_TYPE, "RegularFunction", &F);
});
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
index 1927060de868e..e4a5ebdc818e6 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
@@ -13,6 +13,11 @@ declare void @bar()
; LOWERING: @another_entrypoint_no_callees_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer
; LOWERING: @__llvm_ctx_profile_callsite = external hidden thread_local global ptr
; LOWERING: @__llvm_ctx_profile_expected_callee = external hidden thread_local global ptr
+; LOWERING: @[[GLOB0:[0-9]+]] = internal global { ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB1:[0-9]+]] = internal global { ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB2:[0-9]+]] = internal global { ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB3:[0-9]+]] = internal global { ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB4:[0-9]+]] = internal global { ptr, i8 } zeroinitializer
;.
define void @foo(i32 %a, ptr %fct) {
; INSTRUMENT-LABEL: define void @foo(
@@ -34,7 +39,7 @@ define void @foo(i32 %a, ptr %fct) {
;
; LOWERING-LABEL: define void @foo(
; LOWERING-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) !guid [[META0:![0-9]+]] {
-; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @foo, i64 6699318081062747564, i32 2, i32 2)
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB0]], ptr @foo, i64 6699318081062747564, i32 2, i32 2)
; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1
; LOWERING-NEXT: [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
@@ -176,7 +181,7 @@ define void @simple(i32 %a) {
;
; LOWERING-LABEL: define void @simple(
; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META3:![0-9]+]] {
-; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @simple, i64 -3006003237940970099, i32 1, i32 0)
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB1]], ptr @simple, i64 -3006003237940970099, i32 1, i32 0)
; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2
; LOWERING-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -200,7 +205,7 @@ define i32 @no_callsites(i32 %a) {
;
; LOWERING-LABEL: define i32 @no_callsites(
; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META4:![0-9]+]] {
-; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0)
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB2]], ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0)
; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2
; LOWERING-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -232,7 +237,7 @@ define void @no_counters() {
;
; LOWERING-LABEL: define void @no_counters(
; LOWERING-SAME: ) !guid [[META5:![0-9]+]] {
-; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_counters, i64 5458232184388660970, i32 1, i32 1)
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB3]], ptr @no_counters, i64 5458232184388660970, i32 1, i32 1)
; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1
; LOWERING-NEXT: [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
@@ -260,7 +265,7 @@ define void @inlineasm() {
;
; LOWERING-LABEL: define void @inlineasm(
; LOWERING-SAME: ) !guid [[META6:![0-9]+]] {
-; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @inlineasm, i64 -3771893999295659109, i32 1, i32 0)
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB4]], ptr @inlineasm, i64 -3771893999295659109, i32 1, i32 0)
; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2
; LOWERING-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
More information about the llvm-commits
mailing list