[clang] [compiler-rt] [llvm] [InstrProf] Created Thread local counter instrumentation, compiler-rt runtimes (PR #95494)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 13 19:37:22 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-driver
Author: Andrew Wock (ajwock)
<details>
<summary>Changes</summary>
LLVM can now generate increments to counters in thread local storage.
A new compiler-rt runtime atomically adds the thread-local counters to the global counters on thread exit.
The clang driver will link the new runtime libraries in when the new option -fprofile-thread-local is specified.
More details are available in the RFC on Discourse.
---
Patch is 67.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95494.diff
36 Files Affected:
- (modified) clang/docs/UsersManual.rst (+8)
- (modified) clang/include/clang/Basic/CodeGenOptions.def (+1)
- (modified) clang/include/clang/Driver/Options.td (+3)
- (modified) clang/include/clang/Driver/ToolChain.h (+6)
- (modified) clang/lib/Driver/ToolChain.cpp (+10)
- (modified) clang/lib/Driver/ToolChains/Clang.cpp (+12)
- (modified) clang/lib/Driver/ToolChains/Linux.cpp (+7)
- (modified) compiler-rt/include/profile/InstrProfData.inc (+4)
- (modified) compiler-rt/lib/profile/CMakeLists.txt (+35)
- (added) compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp (+63)
- (modified) compiler-rt/lib/profile/InstrProfilingFile.c (+6)
- (modified) compiler-rt/lib/profile/InstrProfilingPlatformLinux.c (+1)
- (added) compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp (+123)
- (added) compiler-rt/lib/profile/InstrProfilingTLS.c (+29)
- (added) compiler-rt/lib/profile/InstrProfilingTLS.h (+39)
- (added) compiler-rt/lib/profile/InstrProfilingTLSDyLib.c (+100)
- (added) compiler-rt/lib/profile/InstrProfilingTLSDyLib.h (+4)
- (modified) compiler-rt/lib/tsan/rtl/CMakeLists.txt (+1-1)
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c (+7)
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c (+93)
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c (+9)
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c (+9)
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c (+105)
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-exit.c (+37)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test (+27)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test (+41)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test (+48)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test (+24)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test (+32)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-exit.test (+17)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test (+51)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test (+35)
- (added) compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test (+48)
- (modified) llvm/include/llvm/ProfileData/InstrProf.h (+3)
- (modified) llvm/include/llvm/ProfileData/InstrProfData.inc (+4)
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+68-3)
``````````diff
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index f954857b0235a..f7db513b92909 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2932,6 +2932,14 @@ indexed format, regardeless whether it is produced by frontend or the IR pass.
overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported
by the target, or ``single`` otherwise.
+.. option:: -fprofile-thread-local
+
+ Increment profile counters in thread local storage and atomically add their
+ values to global counters on thread exit. This has the potential to deliver
+ both accuracy and high performance whenever there is high thread contention
+ on profile counters. This is an experimental option and it is only supported
+ on 64-bit Linux.
+
Fine Tuning Profile Collection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 7ffc40a00504f..7cd0bfb6d71b5 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -218,6 +218,7 @@ ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
/// instrumented. Selected group numbers can be 0 to N-1 inclusive.
VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)
VALUE_CODEGENOPT(ProfileSelectedFunctionGroup, 32, 0)
+CODEGENOPT(InstrProfileThreadLocal, 1, 0) ///< Counters are updated on a per-thread basis
CODEGENOPT(CoverageMapping , 1, 0) ///< Generate coverage mapping regions to
///< enable code coverage analysis.
CODEGENOPT(DumpCoverageMapping , 1, 0) ///< Dump the generated coverage mapping
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa55c456f..aab5b63c991f1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1768,6 +1768,9 @@ def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
Group<f_Group>, Visibility<[ClangOption, CLOption]>, MetaVarName<"<file>">,
HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
+def fprofile_thread_local : Flag<["-"], "fprofile-thread-local">,
+ Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+ HelpText<"Generage profile counters in thread local storage">;
def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
Visibility<[ClangOption, CLOption]>;
def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 9789cfacafd78..162c730782afb 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -752,6 +752,12 @@ class ToolChain {
virtual void addProfileRTLibs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
+ /// addThreadLocalProfileRTLibs - With -fprofile-thread-local, add the
+ /// thread-local profile runtime static + shared library pair.
+ virtual void
+ addThreadLocalProfileRTLibs(const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs) const;
+
/// Add arguments to use system-specific CUDA includes.
virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 40ab2e91125d1..4708cb7df5044 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1078,6 +1078,16 @@ void ToolChain::addProfileRTLibs(const llvm::opt::ArgList &Args,
CmdArgs.push_back(getCompilerRTArgString(Args, "profile"));
}
+void ToolChain::addThreadLocalProfileRTLibs(
+ const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {
+ if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+ // Static first, so we can specify '-u' where needed
+ CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal"));
+ CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal",
+ ToolChain::FT_Shared));
+ }
+}
+
ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
const ArgList &Args) const {
if (runtimeLibType)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b8d8ff3db5d1f..cd63ac56fecf6 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -720,6 +720,18 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
CmdArgs.push_back("-fcoverage-mcdc");
}
+ if (Args.hasArg(options::OPT_fprofile_thread_local)) {
+ if (!ProfileGenerateArg)
+ D.Diag(clang::diag::err_drv_argument_only_allowed_with)
+ << "-fprofile-thread-local"
+ << "-fprofile-instr-generate";
+
+ // Clang cc1 is not in the know about thread local coverage, but llvm
+ // should be
+ CmdArgs.push_back("-mllvm");
+ CmdArgs.push_back("-instr-prof-thread-local");
+ }
+
if (Arg *A = Args.getLastArg(options::OPT_ffile_compilation_dir_EQ,
options::OPT_fcoverage_compilation_dir_EQ)) {
if (A->getOption().matches(options::OPT_ffile_compilation_dir_EQ))
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 2222dea431c3c..0a889f957786a 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -843,6 +843,13 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
CmdArgs.push_back(Args.MakeArgString(
Twine("-u", llvm::getInstrProfRuntimeHookVarName())));
ToolChain::addProfileRTLibs(Args, CmdArgs);
+
+ if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+ CmdArgs.push_back(Args.MakeArgString(Twine(
+ "-u",
+ llvm::StringRef("__llvm_profile_tls_register_thread_exit_handler"))));
+ }
+ ToolChain::addThreadLocalProfileRTLibs(Args, CmdArgs);
}
void Linux::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index e9866d94b762c..8655bcf498437 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -312,6 +312,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_tls_cnts, \
+ INSTR_PROF_QUOTE(INSTR_PROF_TLS_CNTS_COMMON), \
+ INSTR_PROF_CNTS_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
INSTR_PROF_BITS_COFF, "__DATA,")
@@ -750,6 +753,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_COMMON __llvm_prf_names
#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
#define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_TLS_CNTS_COMMON __llvm_tls_prf_cnts
#define INSTR_PROF_BITS_COMMON __llvm_prf_bits
#define INSTR_PROF_VALS_COMMON __llvm_prf_vals
#define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 45e5164891751..b9f3a20bb328d 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -70,14 +70,25 @@ set(PROFILE_SOURCES
InstrProfilingUtil.c
)
+set(PROFILE_STATIC_TLS_SOURCES
+ InstrProfilingTLS.c
+ InstrProfilingStaticTLSLinux.cpp)
+
+set(PROFILE_SHARED_TLS_SOURCES
+ InstrProfilingTLSDyLib.c
+ InstrProfilingDyLibLinux.cpp)
+
set(PROFILE_HEADERS
InstrProfiling.h
InstrProfilingInternal.h
InstrProfilingPort.h
InstrProfilingUtil.h
+ InstrProfilingTLS.h
WindowsMMap.h
)
+set(PROFILE_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS})
+
if(WIN32)
list(APPEND PROFILE_SOURCES
WindowsMMap.c
@@ -134,6 +145,30 @@ if(APPLE)
ADDITIONAL_HEADERS ${PROFILE_HEADERS}
PARENT_TARGET profile)
else()
+ #if(UNIX AND NOT APPLE AND NOT ANDROID)
+ if(OS_NAME MATCHES "Linux")
+ add_compiler_rt_runtime(clang_rt.profile_threadlocal
+ STATIC
+ OS ${PROFILE_SUPPORTED_OS}
+ ARCHS ${PROFILE_SUPPORTED_ARCH}
+ CFLAGS ${EXTRA_FLAGS}
+ SOURCES ${PROFILE_STATIC_TLS_SOURCES}
+ ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+ PARENT_TARGET profile)
+
+ add_compiler_rt_runtime(clang_rt.profile_threadlocal
+ SHARED
+ OS ${PROFILE_SUPPORTED_OS}
+ ARCHS ${PROFILE_SUPPORTED_ARCH}
+ CFLAGS ${EXTRA_FLAGS}
+ SOURCES ${PROFILE_SHARED_TLS_SOURCES}
+ ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+ OBJECT_LIBS RTInterception
+ RTSanitizerCommon
+ RTSanitizerCommonLibc
+ PARENT_TARGET profile)
+ endif()
+
add_compiler_rt_runtime(clang_rt.profile
STATIC
ARCHS ${PROFILE_SUPPORTED_ARCH}
diff --git a/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
new file mode 100644
index 0000000000000..47f2baa6a5815
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
@@ -0,0 +1,63 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
+ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
+ defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+#include "InstrProfilingTLSDyLib.h"
+}
+
+#include "interception/interception.h"
+
+extern "C" {
+
+struct pthread_wrapper_arg {
+ void *(*fn)(void *);
+ void *arg;
+ uint32_t arg_keepalive;
+};
+
+void *pthread_fn_wrapper(void *arg_ptr) {
+ struct pthread_wrapper_arg *wrapper_arg =
+ (struct pthread_wrapper_arg *)arg_ptr;
+ void *(*fn)(void *) = __atomic_load_n(&wrapper_arg->fn, __ATOMIC_RELAXED);
+ void *arg = __atomic_load_n(&wrapper_arg->arg, __ATOMIC_RELAXED);
+ __atomic_store_n(&wrapper_arg->arg_keepalive, 0, __ATOMIC_RELEASE);
+
+ // startup
+ // Do nothing (TLS is automatically loaded and zeroed)
+ void *retval = fn(arg);
+ // cleanup
+ run_thread_exit_handlers();
+ // Combine counters with main counters
+ return retval;
+}
+
+void __llvm_register_profile_intercepts() { register_profile_intercepts(); }
+
+} // end extern "C"
+
+INTERCEPTOR(int, pthread_create, void *thread, void *attr,
+ void *(*start_routine)(void *), void *arg) {
+ int res = -1;
+ struct pthread_wrapper_arg wrapper_arg = {(void *(*)(void *))start_routine,
+ arg, 1};
+
+ // do pthread
+ res = REAL(pthread_create)(thread, attr, pthread_fn_wrapper, &wrapper_arg);
+ // Spin wait for child thread to copy arguments
+ while (__atomic_load_n(&wrapper_arg.arg_keepalive, __ATOMIC_ACQUIRE) == 1)
+ ;
+ return res;
+}
+
+void register_profile_intercepts() { INTERCEPT_FUNCTION(pthread_create); }
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index e4d99ef4872bd..64775f24fd83c 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -34,6 +34,7 @@
#include "InstrProfiling.h"
#include "InstrProfilingInternal.h"
#include "InstrProfilingPort.h"
+#include "InstrProfilingTLS.h"
#include "InstrProfilingUtil.h"
/* From where is profile name specified.
@@ -1084,6 +1085,8 @@ void __llvm_profile_set_filename(const char *FilenamePat) {
parseAndSetFilename(FilenamePat, PNS_runtime_api, 1);
}
+void (*on_main_thread_exit)(void) = NULL;
+
/* The public API for writing profile data into the file with name
* set by previous calls to __llvm_profile_set_filename or
* __llvm_profile_override_default_filename or
@@ -1097,6 +1100,9 @@ int __llvm_profile_write_file(void) {
// Temporarily suspend getting SIGKILL when the parent exits.
int PDeathSig = lprofSuspendSigKill();
+ if (on_main_thread_exit)
+ on_main_thread_exit();
+
if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
PROF_NOTE("Profile data not written to file: %s.\n", "already written");
if (PDeathSig == 1)
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index b766436497b74..4f96523a56a37 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -45,6 +45,7 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
diff --git a/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
new file mode 100644
index 0000000000000..fc5f785e1ab40
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
@@ -0,0 +1,123 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
+ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
+ defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+}
+
+extern "C" {
+
+#define PROF_TLS_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_TLS_CNTS_COMMON)
+#define PROF_TLS_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_TLS_CNTS_COMMON)
+
+extern char PROF_TLS_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_TLS_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
+COMPILER_RT_VISIBILITY char *__llvm_profile_begin_tls_counters(void) {
+ return &PROF_TLS_CNTS_START;
+}
+COMPILER_RT_VISIBILITY char *__llvm_profile_end_tls_counters(void) {
+ return &PROF_TLS_CNTS_STOP;
+}
+
+struct finalization_data {
+ char *mod_begin;
+ char *tls_img_begin;
+ char *tls_img_end;
+ char *cnts_begin;
+ char *cnts_end;
+};
+
+// This is O(num_modules + num_counters) unfortunately. If there were a
+// mechanism to calculate the thread-local start of a thread-local section like
+// there is a mechanism to calculate the static start of a static section (i.e.
+// __start_$sectionname), that would simplify implementation a lot and make this
+// just O(num_counters).
+static int FindAndAddCounters_cb(struct dl_phdr_info *info, size_t size,
+ void *data) {
+ finalization_data *fdata = (finalization_data *)data;
+ char *mod_begin = fdata->mod_begin;
+ // We're looking for a match to the dladdr calculated based on PROF_CNTS_START
+ if (mod_begin != (char *)info->dlpi_addr) {
+ return 0;
+ }
+
+ if (info->dlpi_tls_data == NULL) {
+ return 1;
+ }
+
+ const Elf64_Phdr *hdr = info->dlpi_phdr;
+ const Elf64_Phdr *last_hdr = hdr + info->dlpi_phnum;
+
+ const Elf64_Phdr *tls_hdr;
+ for (; hdr != last_hdr; ++hdr) {
+ if (hdr->p_type == PT_TLS) {
+ tls_hdr = hdr;
+ goto found_tls_ph;
+ }
+ }
+ return 1;
+found_tls_ph:
+ uint64_t num_counters =
+ __llvm_profile_get_num_counters(fdata->tls_img_begin, fdata->tls_img_end);
+ uint64_t counter_size = __llvm_profile_counter_entry_size();
+
+ // Calculate the offset of __llvm_tls_prf_cnts into the tls block for this
+ // module. The addresses in use below correspond to the tls initialization
+ // image, which is statically allocated for the module, rather than the TLS
+ // block itself.
+ uint64_t ph_true_vaddr =
+ (uint64_t)info->dlpi_addr + (uint64_t)tls_hdr->p_vaddr;
+ uint64_t tls_cnts_tlsblk_offset =
+ (uint64_t)fdata->tls_img_begin - ph_true_vaddr;
+
+ // Calculate the thread local copy of __llvm_tls_prf_cnts for this module.
+ uint64_t tls_prf_cnts_modlocal_begin =
+ (uint64_t)info->dlpi_tls_data + tls_cnts_tlsblk_offset;
+
+ // We don't support single byte counters because they are also resilient to
+ // thread synchronization issues and they are designed to avoid memory
+ // overhead, which is the opposite of what TL counters do.
+ // TODO: warn?
+ if (counter_size == sizeof(uint64_t)) {
+ uint64_t *tls_cnt = (uint64_t *)tls_prf_cnts_modlocal_begin;
+ uint64_t *tls_end = (uint64_t *)tls_cnt + num_counters;
+ uint64_t *cnt = (uint64_t *)fdata->cnts_begin;
+ for (; tls_cnt != tls_end; tls_cnt++, cnt++) {
+ __atomic_fetch_add(cnt, *tls_cnt, __ATOMIC_RELAXED);
+ }
+ }
+ return 1;
+}
+
+COMPILER_RT_VISIBILITY
+void __llvm_profile_tls_counters_finalize(void) {
+ struct finalization_data fdata = {0};
+ fdata.tls_img_begin = __llvm_profile_begin_tls_counters();
+ fdata.tls_img_end = __llvm_profile_end_tls_counters();
+ fdata.cnts_begin = __llvm_profile_begin_counters();
+ fdata.cnts_end = __llvm_profile_end_counters();
+
+ if (!fdata.tls_img_begin || !fdata.tls_img_end || !fdata.cnts_begin ||
+ !fdata.cnts_end) {
+ return;
+ }
+
+ Dl_info info;
+ if (dladdr(fdata.cnts_begin, &info) == 0) {
+ return;
+ }
+ fdata.mod_begin = (char *)info.dli_fbase;
+ dl_iterate_phdr(FindAndAddCounters_cb, &fdata);
+}
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.c b/compiler-rt/lib/profile/InstrProfilingTLS.c
new file mode 100644
index 0000000000000..029ed9e542e5a
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.c
@@ -0,0 +1,29 @@
+#include "InstrProfilingTLS.h"
+#include "InstrProfiling.h"
+
+struct texit_fn_node module_node COMPILER_RT_VISIBILITY;
+
+// We act as a shim between the profile_threadlocal sharedlib
+// and the profile static lib. We need to tell the static lib
+// to add all of the counters up on main thread exit, but the
+// shared lib is the one who knows how to do that and whether it's
+// already been done.
+//
+// In the constructor we pass flush_main_thread_counters from the
+// sharedlib to the non-tls statlib's on_main_thread_exit fnptr.
+extern void flush_main_thread_counters(void);
+extern void (*on_main_thread_exit)(void);
+
+__attribute__((constructor)) COMPILER_RT_VISIBILITY void
+__llvm_profile_tls_register_thread_exit_handler(void) {
+ module_node.prev = NULL;
+ module_node.next = NULL;
+ module_node.fn = __llvm_profile_tls_counters_finalize;
+ register_tls_prfcnts_module_thread_exit_handler(&module_node);
+ if (!on_main_thread_exit) {
+ on_main_thread_exit = flush_main_thread_counters;
+ }
+}
+
+// TODO: Add destructor
+// (But not yet, I'm scared)
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.h b/compiler-rt/lib/profile/InstrProfilingTLS.h
new file mode 100644
index 0000000000000..1b6001d27d375
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.h
@@ -0,0 +1,39 @@
+#ifndef INSTR_PROFILING_TLS_H
+#define INSTR_PROFILING_TLS_H
+
+char *__llvm_profile_begin_tls_counters(void);
+char *__llvm_profile_end_tls_counters(void);
+
+/*!
+ * \brief Add counter values from TLS to the global counters for the program
+ *
+ * On thread exit, atomically add the values in TLS counters to the static
+ * counters for the whole process.
+ */
+void __llvm_profile_tls_counters_finalize(void);
+
+/*
+ * Dylib stuff
+ */
+typedef void (*texit_fnc)(void);
+
+typedef struct texit_fn_node {
+ struct texit_fn_node *prev;
+ texit_fnc fn;
+ struct texit_fn_node *next;
+} texit_fn_node;
+
+// TODO: really this should be write-preferring rwlocked
+struct texit_fn_registry {
+ int texit_mtx;
+ texit_fn_node head;
+ texit_fn_node tail;
+};
+
+void register_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void unregister_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void run_thread_exit_handlers(void);
+
+void register_profile...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/95494
More information about the cfe-commits
mailing list