[compiler-rt] [ctxprof] Auto root detection: trie for stack samples (PR #133106)
Mircea Trofin via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 28 19:51:21 PDT 2025
https://github.com/mtrofin updated https://github.com/llvm/llvm-project/pull/133106
>From 1549a25fa5501f667e518b53c8b925f0ff3a801d Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 24 Mar 2025 12:00:49 -0700
Subject: [PATCH] RuntimeCallsiteTrie
---
compiler-rt/lib/ctx_profile/CMakeLists.txt | 2 +
.../lib/ctx_profile/RootAutoDetector.cpp | 90 ++++++++++
.../lib/ctx_profile/RootAutoDetector.h | 57 +++++++
.../lib/ctx_profile/tests/CMakeLists.txt | 4 +-
.../tests/RootAutoDetectorTest.cpp | 155 ++++++++++++++++++
5 files changed, 307 insertions(+), 1 deletion(-)
create mode 100644 compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
create mode 100644 compiler-rt/lib/ctx_profile/RootAutoDetector.h
create mode 100644 compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
index ce491fc7e8bf0..bb606449c61b1 100644
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -2,11 +2,13 @@ add_compiler_rt_component(ctx_profile)
set(CTX_PROFILE_SOURCES
CtxInstrProfiling.cpp
+ RootAutoDetector.cpp
)
set(CTX_PROFILE_HEADERS
CtxInstrContextNode.h
CtxInstrProfiling.h
+ RootAutoDetector.h
)
include_directories(..)
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
new file mode 100644
index 0000000000000..483c55c25eefe
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
@@ -0,0 +1,90 @@
+//===- RootAutoDetector.cpp - detect contextual profiling roots -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RootAutoDetector.h"
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
+#include <assert.h>
+#include <dlfcn.h>
+#include <pthread.h>
+
+using namespace __ctx_profile;
+template <typename T> using Set = DenseMap<T, bool>;
+
+// Map the runtime address of a callsite to the start address dladdr reports
+// for the symbol covering it. Yields 0 when dladdr reports failure.
+uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
+  // this requires --linkopt=-Wl,--export-dynamic
+  Dl_info SymbolInfo;
+  const void *Address = reinterpret_cast<const void *>(CallsiteAddress);
+  if (dladdr(Address, &SymbolInfo) == 0)
+    return 0;
+  return reinterpret_cast<uptr>(SymbolInfo.dli_saddr);
+}
+
+// Record one sampled stack in the trie. The root node's count is bumped once
+// per inserted stack; then each entry of ST.trace, walked from the last entry
+// to the first, selects (creating it if needed) a child of the previous node
+// and bumps that child's count.
+void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
+  ++TheTrie.Count;
+  auto *Current = &TheTrie;
+  // the stack is backwards - the first callsite is at the top. Count down in
+  // the size's own type so an empty stack simply skips the loop, instead of
+  // relying on `size - 1` wrapping around and being narrowed to a negative int.
+  for (auto Remaining = ST.size; Remaining > 0; --Remaining) {
+    uptr ChildAddr = ST.trace[Remaining - 1];
+    // insert leaves an existing child untouched; either way Iter refers to the
+    // node for ChildAddr under Current.
+    auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
+    ++Iter->second.Count;
+    Current = &Iter->second;
+  }
+}
+
+// Compute the candidate root functions from the stacks recorded so far.
+// Returns a map from function start address (as reported by getFctStartAddr)
+// to the summed Count of the callsite nodes that resolved to that function.
+// The map is empty if no depth of the trie contains more than one distinct
+// function.
+DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
+ // Assuming a message pump design, roots are those functions called by the
+ // message pump. The message pump is an infinite loop (for all practical
+ // considerations) fetching data from a queue. The root functions return -
+ // otherwise the message pump doesn't work. This function detects roots as the
+ // first place in the trie (starting from the root) where a function calls 2
+ // or more functions.
+ //
+ // We start with a callsite trie - the nodes are callsites. Different child
+ // nodes may actually correspond to the same function.
+ //
+ // For example: using function(callsite)
+ // f1(csf1_1) -> f2(csf2_1) -> f3
+ // -> f2(csf2_2) -> f4
+ //
+ // would be represented in our trie as:
+ // csf1_1 -> csf2_1 -> f3
+ // -> csf2_2 -> f4
+ //
+ // While we can assert the control flow returns to f2, we don't know if it
+ // ever returns to f1. f2 could be the message pump.
+ //
+ // We need to convert our callsite tree into a function tree. We can also,
+ // more economically, just see how many distinct functions there are at a
+ // certain depth. When that count is greater than 1, we got to potential roots
+ // and everything above should be considered as non-roots.
+ DenseMap<uptr, uint64_t> Result;
+ // The trie nodes at the depth currently being examined. The map is used as a
+ // set: only the keys matter, the bool values are never read.
+ Set<const Trie *> Worklist;
+ Worklist.insert({&TheTrie, {}});
+
+ // Walk the trie one depth at a time, starting with the top-level node.
+ while (!Worklist.empty()) {
+ Set<const Trie *> NextWorklist;
+ // Function start address -> summed Count of this depth's nodes in it.
+ // Callsites for which getFctStartAddr returns 0 all share the key 0.
+ DenseMap<uptr, uint64_t> Candidates;
+ // NOTE(review): both callbacks always return true, presumably telling
+ // forEach to keep iterating - confirm against DenseMap::forEach.
+ Worklist.forEach([&](const auto &KVP) {
+ auto [Node, _] = KVP;
+ auto SA = getFctStartAddr(Node->CallsiteAddress);
+ Candidates[SA] += Node->Count;
+ // Queue this node's children; they form the next depth.
+ Node->Children.forEach([&](auto &ChildKVP) {
+ NextWorklist.insert({&ChildKVP.second, true});
+ return true;
+ });
+ return true;
+ });
+ // More than one distinct function at this depth: these are the roots, and
+ // deeper levels are not examined.
+ if (Candidates.size() > 1) {
+ Result.swap(Candidates);
+ break;
+ }
+ Worklist.swap(NextWorklist);
+ }
+ return Result;
+}
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.h b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
new file mode 100644
index 0000000000000..85dd5ef1c32d9
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
@@ -0,0 +1,57 @@
+/*===- RootAutoDetector.h - auto-detect roots for ctxprof ----------------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef CTX_PROFILE_ROOTAUTODETECTOR_H_
+#define CTX_PROFILE_ROOTAUTODETECTOR_H_
+
+#include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+#include <pthread.h>
+#include <sanitizer/common_interface_defs.h>
+
+using namespace __asan;
+using namespace __sanitizer;
+
+namespace __ctx_profile {
+
+/// Capture all the stack traces observed for a specific thread. The "for a
+/// specific thread" part is not enforced, but assumed in determineRoots.
+class PerThreadCallsiteTrie {
+protected:
+ /// A trie. A node is the address of a callsite in a function activation. A
+ /// child is a callsite in the activation made from the callsite
+ /// corresponding to the parent.
+ struct Trie final {
+ /// Runtime address of the callsite this node stands for. The top-level
+ /// node (TheTrie) is default-constructed and so carries address 0.
+ const uptr CallsiteAddress;
+ /// Number of inserted stacks that passed through this node.
+ uint64_t Count = 0;
+ /// Child nodes, keyed by their callsite address.
+ DenseMap<uptr, Trie> Children;
+
+ Trie(uptr CallsiteAddress = 0) : CallsiteAddress(CallsiteAddress) {}
+ };
+ /// Top-level node of the trie; its Count is bumped once per inserted stack.
+ Trie TheTrie;
+
+ /// Return the runtime start address of the function that contains the call at
+ /// the runtime address CallsiteAddress. May be overridden for easy testing.
+ virtual uptr getFctStartAddr(uptr CallsiteAddress) const;
+
+public:
+ PerThreadCallsiteTrie(const PerThreadCallsiteTrie &) = delete;
+ PerThreadCallsiteTrie(PerThreadCallsiteTrie &&) = default;
+ PerThreadCallsiteTrie() = default;
+
+ virtual ~PerThreadCallsiteTrie() = default;
+
+ /// Record the stack ST in the trie. ST.trace is walked from its last entry
+ /// to its first: the last entry becomes a child of the top-level node, and
+ /// each earlier entry a child of the entry after it.
+ void insertStack(const StackTrace &ST);
+
+ /// Return the runtime address of root functions, as determined for this
+ /// thread, together with the number of samples that included them.
+ DenseMap<uptr, uint64_t> determineRoots() const;
+};
+} // namespace __ctx_profile
+#endif
diff --git a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
index 012fd7aff7862..0954d5cd34487 100644
--- a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
@@ -22,10 +22,12 @@ append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PR
file(GLOB CTX_PROFILE_HEADERS ../*.h)
set(CTX_PROFILE_SOURCES
- ../CtxInstrProfiling.cpp)
+ ../CtxInstrProfiling.cpp
+ ../RootAutoDetector.cpp)
set(CTX_PROFILE_UNITTESTS
CtxInstrProfilingTest.cpp
+ RootAutoDetectorTest.cpp
driver.cpp)
include_directories(../../../include)
diff --git a/compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp b/compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp
new file mode 100644
index 0000000000000..8fd5bf004faf7
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp
@@ -0,0 +1,155 @@
+#include "../RootAutoDetector.h"
+#include "sanitizer_common/sanitizer_array_ref.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace __ctx_profile;
+using ::testing::IsEmpty;
+using ::testing::Not;
+using ::testing::SizeIs;
+
+// Utility for describing a preorder traversal. By default it captures the
+// address and count at a callsite node. Implicitly nodes are expected to have 1
+// child. If they have none, we place a Marker::term and if they have more than
+// one, we place a Marker::split(nr_of_children) For example, using a list
+// notation, and letters to denote a pair of address and count:
+// (A (B C) (D (E F))) is a list of markers: A, split(2), B, term, C,
+// term, D, split(2), E, term, F, term
+class Marker {
+  // The role a marker plays in the preorder description: the end of a branch,
+  // an (address, count) pair, or the announcement of a node's child count.
+  enum class Kind { End, Value, Split };
+  const uptr Value;
+  const uptr Count;
+  const Kind K;
+  Marker(uptr Val, uptr Cnt, Kind Role) : Value(Val), Count(Cnt), K(Role) {}
+
+public:
+  // An (address, count) marker for a callsite node.
+  Marker(uptr V, uptr C) : Marker(V, C, Kind::Value) {}
+
+  // End-of-branch marker, and a marker announcing V children, respectively.
+  static Marker term() { return Marker(0, 0, Kind::End); }
+  static Marker split(uptr V) { return Marker(V, 0, Kind::Split); }
+
+  bool isVal() const { return K == Kind::Value; }
+  bool isSplit() const { return K == Kind::Split; }
+  bool isTerm() const { return K == Kind::End; }
+
+  // Markers are equal when kind, value and count all agree.
+  bool operator==(const Marker &M) const {
+    if (K != M.K)
+      return false;
+    return Value == M.Value && Count == M.Count;
+  }
+};
+
+// Test double for PerThreadCallsiteTrie: replaces symbol lookup with simple
+// arithmetic and adds a way to compare the trie against a list of Markers.
+class MockCallsiteTrie final : public PerThreadCallsiteTrie {
+ // Pretend every function spans 100 addresses: round the callsite address
+ // down to a multiple of 100, so e.g. 302 maps to 300 and 40 maps to 0.
+ uptr getFctStartAddr(uptr CallsiteAddress) const override {
+ return (CallsiteAddress / 100) * 100;
+ }
+
+ // Assert that the next expected marker is M, then consume it.
+ // NOTE(review): a failing ASSERT_* presumably only returns from this helper,
+ // so the caller keeps going after a mismatch - confirm against gtest docs.
+ static void popAndCheck(ArrayRef<Marker> &Preorder, Marker M) {
+ ASSERT_THAT(Preorder, Not(IsEmpty()));
+ ASSERT_EQ(Preorder[0], M);
+ Preorder = Preorder.drop_front();
+ }
+
+ // Compare the subtree rooted at T against the front of Preorder, consuming
+ // the markers it matches: first T's (address, count), then term() for a
+ // leaf or split(n) for a node with n > 1 children, then each child subtree
+ // in the order Children.forEach visits them.
+ static void checkSameImpl(const Trie &T, ArrayRef<Marker> &Preorder) {
+ popAndCheck(Preorder, {T.CallsiteAddress, T.Count});
+
+ if (T.Children.empty()) {
+ popAndCheck(Preorder, Marker::term());
+ return;
+ }
+
+ // A single child is implicit and gets no marker of its own.
+ if (T.Children.size() > 1)
+ popAndCheck(Preorder, Marker::split(T.Children.size()));
+
+ T.Children.forEach([&](const auto &KVP) {
+ checkSameImpl(KVP.second, Preorder);
+ return true;
+ });
+ }
+
+public:
+ // Check that the whole trie matches Preorder exactly, with no expected
+ // markers left over.
+ void checkSame(ArrayRef<Marker> Preorder) const {
+ checkSameImpl(TheTrie, Preorder);
+ ASSERT_THAT(Preorder, IsEmpty());
+ }
+};
+
+// Insert four stacks one at a time and check the trie's shape and counts after
+// each insertion. Stacks list the innermost callsite first, so {4, 3, 2, 1}
+// produces the chain 1 -> 2 -> 3 -> 4 under the top-level node (address 0).
+TEST(PerThreadCallsiteTrieTest, Insert) {
+ MockCallsiteTrie R;
+ // A first stack yields a single chain, every node counted once.
+ uptr Stack1[]{4, 3, 2, 1};
+ R.insertStack(StackTrace(Stack1, 4));
+ R.checkSame(ArrayRef<Marker>(
+ {{0, 1}, {1, 1}, {2, 1}, {3, 1}, {4, 1}, Marker::term()}));
+
+ // Same chain, one callsite deeper: shared nodes reach count 2, 5 is new.
+ uptr Stack2[]{5, 4, 3, 2, 1};
+ R.insertStack(StackTrace(Stack2, 5));
+ R.checkSame(ArrayRef<Marker>(
+ {{0, 2}, {1, 2}, {2, 2}, {3, 2}, {4, 2}, {5, 1}, Marker::term()}));
+
+ // Diverges below callsite 3, which now has two children: 4 and 6.
+ uptr Stack3[]{6, 3, 2, 1};
+ R.insertStack(StackTrace(Stack3, 4));
+ R.checkSame(ArrayRef<Marker>({{0, 3},
+ {1, 3},
+ {2, 3},
+ {3, 3},
+ Marker::split(2),
+ {4, 2},
+ {5, 1},
+ Marker::term(),
+ {6, 1},
+ Marker::term()}));
+ // Diverges below callsite 2, which now has two children: 7 and 3.
+ // NOTE(review): the expected list has 7 before 3, so it relies on the order
+ // in which Children.forEach visits entries - confirm that order is stable.
+ uptr Stack4[]{7, 2, 1};
+ R.insertStack(StackTrace(Stack4, 3));
+ R.checkSame(ArrayRef<Marker>({{0, 4},
+ {1, 4},
+ {2, 4},
+ Marker::split(2),
+ {7, 1},
+ Marker::term(),
+ {3, 3},
+ Marker::split(2),
+ {4, 2},
+ {5, 1},
+ Marker::term(),
+ {6, 1},
+ Marker::term()}));
+}
+
+// Two stacks that share function 100 and function 200 (through two different
+// callsites, 202 and 203) and then diverge into functions 300 and 400. With
+// the mock resolver, the first depth holding more than one distinct function
+// is {302, 402}, so the detected roots are 300 and 400.
+TEST(PerThreadCallsiteTrieTest, DetectRoots) {
+ MockCallsiteTrie T;
+
+ uptr Stack1[]{501, 302, 202, 102};
+ uptr Stack2[]{601, 402, 203, 102};
+ T.insertStack({Stack1, 4});
+ T.insertStack({Stack2, 4});
+
+ auto R = T.determineRoots();
+ EXPECT_THAT(R, SizeIs(2U));
+ EXPECT_TRUE(R.contains(300));
+ EXPECT_TRUE(R.contains(400));
+}
+
+// A single stack gives a trie with exactly one node per depth, so no depth
+// ever has more than one distinct function and no roots are reported.
+TEST(PerThreadCallsiteTrieTest, DetectRootsNoBranches) {
+ MockCallsiteTrie T;
+
+ uptr Stack1[]{501, 302, 202, 102};
+ T.insertStack({Stack1, 4});
+
+ auto R = T.determineRoots();
+ EXPECT_THAT(R, IsEmpty());
+}
+
+// Like DetectRoots, but one of the diverging callsites resolves to function
+// address 0, standing in for a callsite whose function is unknown. Address 0
+// is then reported as a root alongside 300.
+TEST(PerThreadCallsiteTrieTest, DetectRootsUnknownFct) {
+ MockCallsiteTrie T;
+
+ uptr Stack1[]{501, 302, 202, 102};
+ // The MockCallsiteTrie address resolver rounds addresses down to a multiple
+ // of 100, so 40 will be mapped to 0.
+ uptr Stack2[]{601, 40, 203, 102};
+ T.insertStack({Stack1, 4});
+ T.insertStack({Stack2, 4});
+
+ auto R = T.determineRoots();
+ ASSERT_THAT(R, SizeIs(2U));
+ EXPECT_TRUE(R.contains(300));
+ EXPECT_TRUE(R.contains(0));
+}
More information about the llvm-commits
mailing list