[clang] [compiler-rt] [llvm] [InstrProf] Created Thread local counter instrumentation, compiler-rt runtimes (PR #95494)

via cfe-commits cfe-commits at lists.llvm.org
Thu Jun 13 19:37:22 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-driver

Author: Andrew Wock (ajwock)

<details>
<summary>Changes</summary>

LLVM can now generate increments to counters in thread local storage.

Use a new compiler-rt runtime to atomically add thread local counters to global counters on thread exit.

The clang driver will link the new runtime libraries in when the new option -fprofile-thread-local is specified.

More details are available in the RFC on Discourse.

---

Patch is 67.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95494.diff


36 Files Affected:

- (modified) clang/docs/UsersManual.rst (+8) 
- (modified) clang/include/clang/Basic/CodeGenOptions.def (+1) 
- (modified) clang/include/clang/Driver/Options.td (+3) 
- (modified) clang/include/clang/Driver/ToolChain.h (+6) 
- (modified) clang/lib/Driver/ToolChain.cpp (+10) 
- (modified) clang/lib/Driver/ToolChains/Clang.cpp (+12) 
- (modified) clang/lib/Driver/ToolChains/Linux.cpp (+7) 
- (modified) compiler-rt/include/profile/InstrProfData.inc (+4) 
- (modified) compiler-rt/lib/profile/CMakeLists.txt (+35) 
- (added) compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp (+63) 
- (modified) compiler-rt/lib/profile/InstrProfilingFile.c (+6) 
- (modified) compiler-rt/lib/profile/InstrProfilingPlatformLinux.c (+1) 
- (added) compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp (+123) 
- (added) compiler-rt/lib/profile/InstrProfilingTLS.c (+29) 
- (added) compiler-rt/lib/profile/InstrProfilingTLS.h (+39) 
- (added) compiler-rt/lib/profile/InstrProfilingTLSDyLib.c (+100) 
- (added) compiler-rt/lib/profile/InstrProfilingTLSDyLib.h (+4) 
- (modified) compiler-rt/lib/tsan/rtl/CMakeLists.txt (+1-1) 
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c (+7) 
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c (+93) 
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c (+9) 
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c (+9) 
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c (+105) 
- (added) compiler-rt/test/profile/Inputs/instrprof-tls-exit.c (+37) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test (+27) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test (+41) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test (+48) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test (+24) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test (+32) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-exit.test (+17) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test (+51) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test (+35) 
- (added) compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test (+48) 
- (modified) llvm/include/llvm/ProfileData/InstrProf.h (+3) 
- (modified) llvm/include/llvm/ProfileData/InstrProfData.inc (+4) 
- (modified) llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp (+68-3) 


``````````diff
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index f954857b0235a..f7db513b92909 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2932,6 +2932,14 @@ indexed format, regardeless whether it is produced by frontend or the IR pass.
   overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported
   by the target, or ``single`` otherwise.
 
+.. option:: -fprofile-thread-local
+
+   Increment profile counters in thread local storage and atomically add their
+   values to global counters on thread exit.  This has the potential to deliver
+   both accuracy and high performance whenever there is high thread contention
+   on profile counters.  This is an experimental option and it is only supported
+   on 64-bit Linux.
+
 Fine Tuning Profile Collection
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 7ffc40a00504f..7cd0bfb6d71b5 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -218,6 +218,7 @@ ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
 /// instrumented. Selected group numbers can be 0 to N-1 inclusive.
 VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)
 VALUE_CODEGENOPT(ProfileSelectedFunctionGroup, 32, 0)
+CODEGENOPT(InstrProfileThreadLocal, 1, 0) ///< Counters are updated on a per-thread basis
 CODEGENOPT(CoverageMapping , 1, 0) ///< Generate coverage mapping regions to
                                    ///< enable code coverage analysis.
 CODEGENOPT(DumpCoverageMapping , 1, 0) ///< Dump the generated coverage mapping
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa55c456f..aab5b63c991f1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1768,6 +1768,9 @@ def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
 def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
     Group<f_Group>, Visibility<[ClangOption, CLOption]>, MetaVarName<"<file>">,
     HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
+def fprofile_thread_local : Flag<["-"], "fprofile-thread-local">,
+    Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+    HelpText<"Generate profile counters in thread local storage">;
 def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
     Visibility<[ClangOption, CLOption]>;
 def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 9789cfacafd78..162c730782afb 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -752,6 +752,12 @@ class ToolChain {
   virtual void addProfileRTLibs(const llvm::opt::ArgList &Args,
                                 llvm::opt::ArgStringList &CmdArgs) const;
 
+  /// addThreadLocalProfileRTLibs - With -fprofile-thread-local, add the
+  /// threadlocal profile runtime static + shared library pair.
+  virtual void
+  addThreadLocalProfileRTLibs(const llvm::opt::ArgList &Args,
+                              llvm::opt::ArgStringList &CmdArgs) const;
+
   /// Add arguments to use system-specific CUDA includes.
   virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 40ab2e91125d1..4708cb7df5044 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1078,6 +1078,16 @@ void ToolChain::addProfileRTLibs(const llvm::opt::ArgList &Args,
   CmdArgs.push_back(getCompilerRTArgString(Args, "profile"));
 }
 
+void ToolChain::addThreadLocalProfileRTLibs(
+    const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {
+  if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+    // Static first, so we can specify '-u' where needed
+    CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal"));
+    CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal",
+                                             ToolChain::FT_Shared));
+  }
+}
+
 ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
     const ArgList &Args) const {
   if (runtimeLibType)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b8d8ff3db5d1f..cd63ac56fecf6 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -720,6 +720,18 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
     CmdArgs.push_back("-fcoverage-mcdc");
   }
 
+  if (Args.hasArg(options::OPT_fprofile_thread_local)) {
+    if (!ProfileGenerateArg)
+      D.Diag(clang::diag::err_drv_argument_only_allowed_with)
+          << "-fprofile-thread-local"
+          << "-fprofile-instr-generate";
+
+    // Clang cc1 is not in the know about thread local coverage, but llvm
+    // should be
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back("-instr-prof-thread-local");
+  }
+
   if (Arg *A = Args.getLastArg(options::OPT_ffile_compilation_dir_EQ,
                                options::OPT_fcoverage_compilation_dir_EQ)) {
     if (A->getOption().matches(options::OPT_ffile_compilation_dir_EQ))
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 2222dea431c3c..0a889f957786a 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -843,6 +843,13 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
     CmdArgs.push_back(Args.MakeArgString(
         Twine("-u", llvm::getInstrProfRuntimeHookVarName())));
   ToolChain::addProfileRTLibs(Args, CmdArgs);
+
+  if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+    CmdArgs.push_back(Args.MakeArgString(Twine(
+        "-u",
+        llvm::StringRef("__llvm_profile_tls_register_thread_exit_handler"))));
+  }
+  ToolChain::addThreadLocalProfileRTLibs(Args, CmdArgs);
 }
 
 void Linux::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index e9866d94b762c..8655bcf498437 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -312,6 +312,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
 INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
                       INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
                       INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_tls_cnts, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_TLS_CNTS_COMMON), \
+                      INSTR_PROF_CNTS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
                       INSTR_PROF_BITS_COFF, "__DATA,")
@@ -750,6 +753,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
 #define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_TLS_CNTS_COMMON __llvm_tls_prf_cnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 45e5164891751..b9f3a20bb328d 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -70,14 +70,25 @@ set(PROFILE_SOURCES
   InstrProfilingUtil.c
   )
 
+set(PROFILE_STATIC_TLS_SOURCES
+  InstrProfilingTLS.c
+  InstrProfilingStaticTLSLinux.cpp)
+
+set(PROFILE_SHARED_TLS_SOURCES
+  InstrProfilingTLSDyLib.c
+  InstrProfilingDyLibLinux.cpp)
+
 set(PROFILE_HEADERS
   InstrProfiling.h
   InstrProfilingInternal.h
   InstrProfilingPort.h
   InstrProfilingUtil.h
+  InstrProfilingTLS.h
   WindowsMMap.h
   )
 
+set(PROFILE_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS})
+
 if(WIN32)
   list(APPEND PROFILE_SOURCES
     WindowsMMap.c
@@ -134,6 +145,30 @@ if(APPLE)
     ADDITIONAL_HEADERS ${PROFILE_HEADERS}
     PARENT_TARGET profile)
 else()
+  #if(UNIX AND NOT APPLE AND NOT ANDROID)
+  if(OS_NAME MATCHES "Linux")
+    add_compiler_rt_runtime(clang_rt.profile_threadlocal
+      STATIC
+      OS ${PROFILE_SUPPORTED_OS}
+      ARCHS ${PROFILE_SUPPORTED_ARCH}
+      CFLAGS ${EXTRA_FLAGS}
+      SOURCES ${PROFILE_STATIC_TLS_SOURCES} 
+      ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+      PARENT_TARGET profile)
+
+    add_compiler_rt_runtime(clang_rt.profile_threadlocal
+      SHARED
+      OS ${PROFILE_SUPPORTED_OS}
+      ARCHS ${PROFILE_SUPPORTED_ARCH}
+      CFLAGS ${EXTRA_FLAGS}
+      SOURCES ${PROFILE_SHARED_TLS_SOURCES} 
+      ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+      OBJECT_LIBS RTInterception
+                  RTSanitizerCommon
+                  RTSanitizerCommonLibc
+      PARENT_TARGET profile)
+  endif()
+
   add_compiler_rt_runtime(clang_rt.profile
     STATIC
     ARCHS ${PROFILE_SUPPORTED_ARCH}
diff --git a/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
new file mode 100644
index 0000000000000..47f2baa6a5815
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
@@ -0,0 +1,63 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) ||      \
+    (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) ||          \
+    defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+#include "InstrProfilingTLSDyLib.h"
+}
+
+#include "interception/interception.h"
+
+extern "C" {
+
+struct pthread_wrapper_arg {
+  void *(*fn)(void *);
+  void *arg;
+  uint32_t arg_keepalive;
+};
+
+void *pthread_fn_wrapper(void *arg_ptr) {
+  struct pthread_wrapper_arg *wrapper_arg =
+      (struct pthread_wrapper_arg *)arg_ptr;
+  void *(*fn)(void *) = __atomic_load_n(&wrapper_arg->fn, __ATOMIC_RELAXED);
+  void *arg = __atomic_load_n(&wrapper_arg->arg, __ATOMIC_RELAXED);
+  __atomic_store_n(&wrapper_arg->arg_keepalive, 0, __ATOMIC_RELEASE);
+
+  // startup
+  // Do nothing (TLS is automatically loaded and zeroed)
+  void *retval = fn(arg);
+  // cleanup
+  run_thread_exit_handlers();
+  // Combine counters with main counters
+  return retval;
+}
+
+void __llvm_register_profile_intercepts() { register_profile_intercepts(); }
+
+} // end extern "C"
+
+INTERCEPTOR(int, pthread_create, void *thread, void *attr,
+            void *(*start_routine)(void *), void *arg) {
+  int res = -1;
+  struct pthread_wrapper_arg wrapper_arg = {(void *(*)(void *))start_routine,
+                                            arg, 1};
+
+  // do pthread
+  res = REAL(pthread_create)(thread, attr, pthread_fn_wrapper, &wrapper_arg);
+  // Spin wait for child thread to copy arguments
+  while (__atomic_load_n(&wrapper_arg.arg_keepalive, __ATOMIC_ACQUIRE) == 1)
+    ;
+  return res;
+}
+
+void register_profile_intercepts() { INTERCEPT_FUNCTION(pthread_create); }
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index e4d99ef4872bd..64775f24fd83c 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -34,6 +34,7 @@
 #include "InstrProfiling.h"
 #include "InstrProfilingInternal.h"
 #include "InstrProfilingPort.h"
+#include "InstrProfilingTLS.h"
 #include "InstrProfilingUtil.h"
 
 /* From where is profile name specified.
@@ -1084,6 +1085,8 @@ void __llvm_profile_set_filename(const char *FilenamePat) {
   parseAndSetFilename(FilenamePat, PNS_runtime_api, 1);
 }
 
+void (*on_main_thread_exit)(void) = NULL;
+
 /* The public API for writing profile data into the file with name
  * set by previous calls to __llvm_profile_set_filename or
  * __llvm_profile_override_default_filename or
@@ -1097,6 +1100,9 @@ int __llvm_profile_write_file(void) {
   // Temporarily suspend getting SIGKILL when the parent exits.
   int PDeathSig = lprofSuspendSigKill();
 
+  if (on_main_thread_exit)
+    on_main_thread_exit();
+
   if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
     PROF_NOTE("Profile data not written to file: %s.\n", "already written");
     if (PDeathSig == 1)
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index b766436497b74..4f96523a56a37 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -45,6 +45,7 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY
     COMPILER_RT_WEAK;
 extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
 extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
diff --git a/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
new file mode 100644
index 0000000000000..fc5f785e1ab40
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
@@ -0,0 +1,123 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) ||      \
+    (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) ||          \
+    defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+}
+
+extern "C" {
+
+#define PROF_TLS_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_TLS_CNTS_COMMON)
+#define PROF_TLS_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_TLS_CNTS_COMMON)
+
+extern char PROF_TLS_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_TLS_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
+COMPILER_RT_VISIBILITY char *__llvm_profile_begin_tls_counters(void) {
+  return &PROF_TLS_CNTS_START;
+}
+COMPILER_RT_VISIBILITY char *__llvm_profile_end_tls_counters(void) {
+  return &PROF_TLS_CNTS_STOP;
+}
+
+struct finalization_data {
+  char *mod_begin;
+  char *tls_img_begin;
+  char *tls_img_end;
+  char *cnts_begin;
+  char *cnts_end;
+};
+
+// This is O(num_modules + num_counters) unfortunately.  If there were a
+// mechanism to calculate the thread-local start of a thread-local section like
+// there is a mechanism to calculate the static start of a static section (i.e.
+// __start_$sectionname), that would simplify implementation a lot and make this
+// just O(num_counters).
+static int FindAndAddCounters_cb(struct dl_phdr_info *info, size_t size,
+                                 void *data) {
+  finalization_data *fdata = (finalization_data *)data;
+  char *mod_begin = fdata->mod_begin;
+  // We're looking for a match to the dladdr calculated based on PROF_CNTS_START
+  if (mod_begin != (char *)info->dlpi_addr) {
+    return 0;
+  }
+
+  if (info->dlpi_tls_data == NULL) {
+    return 1;
+  }
+
+  const Elf64_Phdr *hdr = info->dlpi_phdr;
+  const Elf64_Phdr *last_hdr = hdr + info->dlpi_phnum;
+
+  const Elf64_Phdr *tls_hdr;
+  for (; hdr != last_hdr; ++hdr) {
+    if (hdr->p_type == PT_TLS) {
+      tls_hdr = hdr;
+      goto found_tls_ph;
+    }
+  }
+  return 1;
+found_tls_ph:
+  uint64_t num_counters =
+      __llvm_profile_get_num_counters(fdata->tls_img_begin, fdata->tls_img_end);
+  uint64_t counter_size = __llvm_profile_counter_entry_size();
+
+  // Calculate the offset of __llvm_tls_prf_cnts into the tls block for this
+  // module. The addresses in use below correspond to the tls initialization
+  // image, which is statically allocated for the module, rather than the TLS
+  // block itself.
+  uint64_t ph_true_vaddr =
+      (uint64_t)info->dlpi_addr + (uint64_t)tls_hdr->p_vaddr;
+  uint64_t tls_cnts_tlsblk_offset =
+      (uint64_t)fdata->tls_img_begin - ph_true_vaddr;
+
+  // Calculate the thread local copy of __llvm_tls_prf_cnts for this module.
+  uint64_t tls_prf_cnts_modlocal_begin =
+      (uint64_t)info->dlpi_tls_data + tls_cnts_tlsblk_offset;
+
+  // We don't support single byte counters because they are also resilient to
+  // thread synchronization issues and they are designed to avoid memory
+  // overhead, which is the opposite of what TL counters do.
+  // TODO: warn?
+  if (counter_size == sizeof(uint64_t)) {
+    uint64_t *tls_cnt = (uint64_t *)tls_prf_cnts_modlocal_begin;
+    uint64_t *tls_end = (uint64_t *)tls_cnt + num_counters;
+    uint64_t *cnt = (uint64_t *)fdata->cnts_begin;
+    for (; tls_cnt != tls_end; tls_cnt++, cnt++) {
+      __atomic_fetch_add(cnt, *tls_cnt, __ATOMIC_RELAXED);
+    }
+  }
+  return 1;
+}
+
+COMPILER_RT_VISIBILITY
+void __llvm_profile_tls_counters_finalize(void) {
+  struct finalization_data fdata = {0};
+  fdata.tls_img_begin = __llvm_profile_begin_tls_counters();
+  fdata.tls_img_end = __llvm_profile_end_tls_counters();
+  fdata.cnts_begin = __llvm_profile_begin_counters();
+  fdata.cnts_end = __llvm_profile_end_counters();
+
+  if (!fdata.tls_img_begin || !fdata.tls_img_end || !fdata.cnts_begin ||
+      !fdata.cnts_end) {
+    return;
+  }
+
+  Dl_info info;
+  if (dladdr(fdata.cnts_begin, &info) == 0) {
+    return;
+  }
+  fdata.mod_begin = (char *)info.dli_fbase;
+  dl_iterate_phdr(FindAndAddCounters_cb, &fdata);
+}
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.c b/compiler-rt/lib/profile/InstrProfilingTLS.c
new file mode 100644
index 0000000000000..029ed9e542e5a
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.c
@@ -0,0 +1,29 @@
+#include "InstrProfilingTLS.h"
+#include "InstrProfiling.h"
+
+struct texit_fn_node module_node COMPILER_RT_VISIBILITY;
+
+// We act as a shim between the profile_threadlocal sharedlib
+// and the profile static lib.  We need to tell the static lib
+// to add all of the counters up on main thread exit, but the
+// shared lib is the one who knows how to do that and whether it's
+// already been done.
+//
+// In the constructor we pass flush_main_thread_counters from the
+// sharedlib to the non-tls statlib's on_main_thread_exit fnptr.
+extern void flush_main_thread_counters(void);
+extern void (*on_main_thread_exit)(void);
+
+__attribute__((constructor)) COMPILER_RT_VISIBILITY void
+__llvm_profile_tls_register_thread_exit_handler(void) {
+  module_node.prev = NULL;
+  module_node.next = NULL;
+  module_node.fn = __llvm_profile_tls_counters_finalize;
+  register_tls_prfcnts_module_thread_exit_handler(&module_node);
+  if (!on_main_thread_exit) {
+    on_main_thread_exit = flush_main_thread_counters;
+  }
+}
+
+// TODO: Add destructor
+// (But not yet, I'm scared)
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.h b/compiler-rt/lib/profile/InstrProfilingTLS.h
new file mode 100644
index 0000000000000..1b6001d27d375
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.h
@@ -0,0 +1,39 @@
+#ifndef INSTR_PROFILING_TLS_H
+#define INSTR_PROFILING_TLS_H
+
+char *__llvm_profile_begin_tls_counters(void);
+char *__llvm_profile_end_tls_counters(void);
+
+/*!
+ * \brief Add counter values from TLS to the global counters for the program
+ *
+ * On thread exit, atomically add the values in TLS counters to the static
+ * counters for the whole process.
+ */
+void __llvm_profile_tls_counters_finalize(void);
+
+/*
+ * Dylib stuff
+ */
+typedef void (*texit_fnc)(void);
+
+typedef struct texit_fn_node {
+  struct texit_fn_node *prev;
+  texit_fnc fn;
+  struct texit_fn_node *next;
+} texit_fn_node;
+
+// TODO: really this should be write-preferring rwlocked
+struct texit_fn_registry {
+  int texit_mtx;
+  texit_fn_node head;
+  texit_fn_node tail;
+};
+
+void register_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void unregister_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void run_thread_exit_handlers(void);
+
+void register_profile...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/95494


More information about the cfe-commits mailing list