[clang] [compiler-rt] [llvm] [InstrProf] Created Thread local counter instrumentation, compiler-rt runtimes (PR #95494)
Andrew Wock via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 13 19:36:52 PDT 2024
https://github.com/ajwock created https://github.com/llvm/llvm-project/pull/95494
LLVM can now generate increments to counters in thread local storage.
Use a new compiler-rt runtime to atomically add thread local counters to global counters on thread exit.
The clang driver will link the new runtime libraries in when the new option -fprofile-thread-local is specified.
More details available in the RFC on discourse.
>From 44e2159636efd601c90aced44856d17d77728caa Mon Sep 17 00:00:00 2001
From: Andrew Wock <ajwock at gmail.com>
Date: Tue, 4 Jun 2024 09:45:31 -0400
Subject: [PATCH] Created Thread local counter instrumentation.
LLVM can now generate increments to counters in thread local storage.
Use a new compiler-rt runtime to atomically add thread local
counters to global counters on thread exit.
The clang driver will link the new runtime libraries in when the
new option -fprofile-thread-local is specified.
Signed-off-by: Andrew Wock <ajwock at gmail.com>
---
clang/docs/UsersManual.rst | 8 ++
clang/include/clang/Basic/CodeGenOptions.def | 1 +
clang/include/clang/Driver/Options.td | 3 +
clang/include/clang/Driver/ToolChain.h | 6 +
clang/lib/Driver/ToolChain.cpp | 10 ++
clang/lib/Driver/ToolChains/Clang.cpp | 12 ++
clang/lib/Driver/ToolChains/Linux.cpp | 7 +
compiler-rt/include/profile/InstrProfData.inc | 4 +
compiler-rt/lib/profile/CMakeLists.txt | 35 +++++
.../lib/profile/InstrProfilingDyLibLinux.cpp | 63 +++++++++
compiler-rt/lib/profile/InstrProfilingFile.c | 6 +
.../lib/profile/InstrProfilingPlatformLinux.c | 1 +
.../profile/InstrProfilingStaticTLSLinux.cpp | 123 ++++++++++++++++++
compiler-rt/lib/profile/InstrProfilingTLS.c | 29 +++++
compiler-rt/lib/profile/InstrProfilingTLS.h | 39 ++++++
.../lib/profile/InstrProfilingTLSDyLib.c | 100 ++++++++++++++
.../lib/profile/InstrProfilingTLSDyLib.h | 4 +
compiler-rt/lib/tsan/rtl/CMakeLists.txt | 2 +-
.../Inputs/instrprof-tls-dlclose-lib.c | 7 +
.../Inputs/instrprof-tls-dlclose-main.c | 93 +++++++++++++
.../Inputs/instrprof-tls-dlopen-func.c | 9 ++
.../Inputs/instrprof-tls-dlopen-func2.c | 9 ++
.../Inputs/instrprof-tls-dlopen-main.c | 105 +++++++++++++++
.../test/profile/Inputs/instrprof-tls-exit.c | 37 ++++++
.../Linux/instrprof-tls-dlclose-memfault.test | 27 ++++
.../instrprof-tls-dlclose-mix-subset.test | 41 ++++++
.../Linux/instrprof-tls-dlclose-mix.test | 48 +++++++
.../Linux/instrprof-tls-dlclose-nodelete.test | 24 ++++
.../profile/Linux/instrprof-tls-dlopen.test | 32 +++++
.../profile/Linux/instrprof-tls-exit.test | 17 +++
.../Linux/instrprof-tls-noclose-mix.test | 51 ++++++++
.../instrprof-tls-shared-mix-subset.test | 35 +++++
.../Linux/instrprof-tls-shared-mix.test | 48 +++++++
llvm/include/llvm/ProfileData/InstrProf.h | 3 +
.../llvm/ProfileData/InstrProfData.inc | 4 +
.../Instrumentation/InstrProfiling.cpp | 71 +++++++++-
36 files changed, 1110 insertions(+), 4 deletions(-)
create mode 100644 compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
create mode 100644 compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
create mode 100644 compiler-rt/lib/profile/InstrProfilingTLS.c
create mode 100644 compiler-rt/lib/profile/InstrProfilingTLS.h
create mode 100644 compiler-rt/lib/profile/InstrProfilingTLSDyLib.c
create mode 100644 compiler-rt/lib/profile/InstrProfilingTLSDyLib.h
create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c
create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c
create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c
create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c
create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c
create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-exit.c
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-exit.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test
create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index f954857b0235a..f7db513b92909 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2932,6 +2932,14 @@ indexed format, regardeless whether it is produced by frontend or the IR pass.
overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported
by the target, or ``single`` otherwise.
+.. option:: -fprofile-thread-local
+
+ Increment profile counters in thread local storage and atomically add their
+ values to global counters on thread exit. This has the potential to deliver
+ both accuracy and high performance whenever there is high thread contention
+ on profile counters. This is an experimental option and it is only supported
+ on 64-bit Linux.
+
Fine Tuning Profile Collection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 7ffc40a00504f..7cd0bfb6d71b5 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -218,6 +218,7 @@ ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
/// instrumented. Selected group numbers can be 0 to N-1 inclusive.
VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)
VALUE_CODEGENOPT(ProfileSelectedFunctionGroup, 32, 0)
+CODEGENOPT(InstrProfileThreadLocal, 1, 0) ///< Counters are updated on a per-thread basis
CODEGENOPT(CoverageMapping , 1, 0) ///< Generate coverage mapping regions to
///< enable code coverage analysis.
CODEGENOPT(DumpCoverageMapping , 1, 0) ///< Dump the generated coverage mapping
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa55c456f..aab5b63c991f1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1768,6 +1768,9 @@ def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
Group<f_Group>, Visibility<[ClangOption, CLOption]>, MetaVarName<"<file>">,
HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
+// Driver flag that requests profile counters in thread local storage.
+def fprofile_thread_local : Flag<["-"], "fprofile-thread-local">,
+  Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+  HelpText<"Generate profile counters in thread local storage">;
def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
Visibility<[ClangOption, CLOption]>;
def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 9789cfacafd78..162c730782afb 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -752,6 +752,12 @@ class ToolChain {
virtual void addProfileRTLibs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
+ /// addThreadLocalProfileRTLibs - With -fprofile-thread-local, add the
+ /// thread-local profile runtime static + shared library pair.
+ virtual void
+ addThreadLocalProfileRTLibs(const llvm::opt::ArgList &Args,
+ llvm::opt::ArgStringList &CmdArgs) const;
+
/// Add arguments to use system-specific CUDA includes.
virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 40ab2e91125d1..4708cb7df5044 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1078,6 +1078,16 @@ void ToolChain::addProfileRTLibs(const llvm::opt::ArgList &Args,
CmdArgs.push_back(getCompilerRTArgString(Args, "profile"));
}
+/// Appends the static and then the shared "profile_threadlocal" compiler-rt
+/// libraries to CmdArgs. Does nothing unless the profile runtime is needed
+/// and -fprofile-thread-local was passed.
+void ToolChain::addThreadLocalProfileRTLibs(
+    const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {
+  const bool WantsThreadLocalRT =
+      needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local);
+  if (!WantsThreadLocalRT)
+    return;
+
+  // Static first, so we can specify '-u' where needed
+  CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal"));
+  CmdArgs.push_back(
+      getCompilerRTArgString(Args, "profile_threadlocal", ToolChain::FT_Shared));
+}
+
ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
const ArgList &Args) const {
if (runtimeLibType)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b8d8ff3db5d1f..cd63ac56fecf6 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -720,6 +720,18 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
CmdArgs.push_back("-fcoverage-mcdc");
}
+ if (Args.hasArg(options::OPT_fprofile_thread_local)) {
+ if (!ProfileGenerateArg)
+ D.Diag(clang::diag::err_drv_argument_only_allowed_with)
+ << "-fprofile-thread-local"
+ << "-fprofile-instr-generate";
+
+ // Clang cc1 is not in the know about thread local coverage, but llvm
+ // should be
+ CmdArgs.push_back("-mllvm");
+ CmdArgs.push_back("-instr-prof-thread-local");
+ }
+
if (Arg *A = Args.getLastArg(options::OPT_ffile_compilation_dir_EQ,
options::OPT_fcoverage_compilation_dir_EQ)) {
if (A->getOption().matches(options::OPT_ffile_compilation_dir_EQ))
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 2222dea431c3c..0a889f957786a 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -843,6 +843,13 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
CmdArgs.push_back(Args.MakeArgString(
Twine("-u", llvm::getInstrProfRuntimeHookVarName())));
ToolChain::addProfileRTLibs(Args, CmdArgs);
+
+ if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+ CmdArgs.push_back(Args.MakeArgString(Twine(
+ "-u",
+ llvm::StringRef("__llvm_profile_tls_register_thread_exit_handler"))));
+ }
+ ToolChain::addThreadLocalProfileRTLibs(Args, CmdArgs);
}
void Linux::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index e9866d94b762c..8655bcf498437 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -312,6 +312,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_tls_cnts, \
+ INSTR_PROF_QUOTE(INSTR_PROF_TLS_CNTS_COMMON), \
+ INSTR_PROF_CNTS_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
INSTR_PROF_BITS_COFF, "__DATA,")
@@ -750,6 +753,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_COMMON __llvm_prf_names
#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
#define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_TLS_CNTS_COMMON __llvm_tls_prf_cnts
#define INSTR_PROF_BITS_COMMON __llvm_prf_bits
#define INSTR_PROF_VALS_COMMON __llvm_prf_vals
#define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 45e5164891751..b9f3a20bb328d 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -70,14 +70,25 @@ set(PROFILE_SOURCES
InstrProfilingUtil.c
)
+set(PROFILE_STATIC_TLS_SOURCES
+ InstrProfilingTLS.c
+ InstrProfilingStaticTLSLinux.cpp)
+
+set(PROFILE_SHARED_TLS_SOURCES
+ InstrProfilingTLSDyLib.c
+ InstrProfilingDyLibLinux.cpp)
+
set(PROFILE_HEADERS
InstrProfiling.h
InstrProfilingInternal.h
InstrProfilingPort.h
InstrProfilingUtil.h
+ InstrProfilingTLS.h
WindowsMMap.h
)
+set(PROFILE_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS})
+
if(WIN32)
list(APPEND PROFILE_SOURCES
WindowsMMap.c
@@ -134,6 +145,30 @@ if(APPLE)
ADDITIONAL_HEADERS ${PROFILE_HEADERS}
PARENT_TARGET profile)
else()
+ # Linux only for now (alternative condition considered: UNIX AND NOT APPLE AND NOT ANDROID).
+ if(OS_NAME MATCHES "Linux")
+ add_compiler_rt_runtime(clang_rt.profile_threadlocal
+ STATIC
+ OS ${PROFILE_SUPPORTED_OS}
+ ARCHS ${PROFILE_SUPPORTED_ARCH}
+ CFLAGS ${EXTRA_FLAGS}
+ SOURCES ${PROFILE_STATIC_TLS_SOURCES}
+ ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+ PARENT_TARGET profile)
+
+ add_compiler_rt_runtime(clang_rt.profile_threadlocal
+ SHARED
+ OS ${PROFILE_SUPPORTED_OS}
+ ARCHS ${PROFILE_SUPPORTED_ARCH}
+ CFLAGS ${EXTRA_FLAGS}
+ SOURCES ${PROFILE_SHARED_TLS_SOURCES}
+ ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+ OBJECT_LIBS RTInterception
+ RTSanitizerCommon
+ RTSanitizerCommonLibc
+ PARENT_TARGET profile)
+ endif()
+
add_compiler_rt_runtime(clang_rt.profile
STATIC
ARCHS ${PROFILE_SUPPORTED_ARCH}
diff --git a/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
new file mode 100644
index 0000000000000..47f2baa6a5815
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
@@ -0,0 +1,63 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
+ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
+ defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+#include "InstrProfilingTLSDyLib.h"
+}
+
+#include "interception/interception.h"
+
+extern "C" {
+
+struct pthread_wrapper_arg { // hand-off record passed from the pthread_create interceptor to pthread_fn_wrapper
+ void *(*fn)(void *); // start routine supplied by the pthread_create caller
+ void *arg; // argument to forward to fn
+ uint32_t arg_keepalive; // 1 while the new thread still needs this record; the new thread stores 0 once fn and arg are copied
+};
+
+void *pthread_fn_wrapper(void *arg_ptr) { // start routine the interceptor hands to the real pthread_create
+ struct pthread_wrapper_arg *wrapper_arg =
+ (struct pthread_wrapper_arg *)arg_ptr;
+ void *(*fn)(void *) = __atomic_load_n(&wrapper_arg->fn, __ATOMIC_RELAXED);
+ void *arg = __atomic_load_n(&wrapper_arg->arg, __ATOMIC_RELAXED);
+ __atomic_store_n(&wrapper_arg->arg_keepalive, 0, __ATOMIC_RELEASE); // releases the creator, which spins on this flag; wrapper_arg lives on the creator's stack and must not be used after this store
+
+ // Startup:
+ // nothing to do (TLS is automatically loaded and zeroed)
+ void *retval = fn(arg);
+ // Cleanup: run the registered handlers, which combine this thread's counters with the main counters
+ run_thread_exit_handlers();
+ // NOTE(review): only reached when fn returns; a thread that ends via pthread_exit() or cancellation would skip the handlers - confirm
+ return retval;
+}
+
+void __llvm_register_profile_intercepts() { register_profile_intercepts(); } // extern "C" entry point that forwards to register_profile_intercepts()
+
+} // end extern "C"
+
+// Interceptor for pthread_create. The new thread is started through
+// pthread_fn_wrapper so that run_thread_exit_handlers() runs once
+// start_routine returns.
+//
+// wrapper_arg lives in this stack frame, so after a successful create we must
+// not return until the new thread has copied fn/arg out of it and cleared
+// arg_keepalive. If the real pthread_create fails, no thread exists to clear
+// the flag, so waiting would spin forever; the error is returned immediately.
+//
+// Returns whatever the real pthread_create returned.
+INTERCEPTOR(int, pthread_create, void *thread, void *attr,
+            void *(*start_routine)(void *), void *arg) {
+  struct pthread_wrapper_arg wrapper_arg = {(void *(*)(void *))start_routine,
+                                            arg, 1};
+
+  int res =
+      REAL(pthread_create)(thread, attr, pthread_fn_wrapper, &wrapper_arg);
+  if (res != 0)
+    return res;
+
+  // Spin wait for child thread to copy arguments
+  while (__atomic_load_n(&wrapper_arg.arg_keepalive, __ATOMIC_ACQUIRE) == 1)
+    ;
+  return res;
+}
+
+void register_profile_intercepts() { INTERCEPT_FUNCTION(pthread_create); } // installs the pthread_create interceptor defined in this file
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index e4d99ef4872bd..64775f24fd83c 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -34,6 +34,7 @@
#include "InstrProfiling.h"
#include "InstrProfilingInternal.h"
#include "InstrProfilingPort.h"
+#include "InstrProfilingTLS.h"
#include "InstrProfilingUtil.h"
/* From where is profile name specified.
@@ -1084,6 +1085,8 @@ void __llvm_profile_set_filename(const char *FilenamePat) {
parseAndSetFilename(FilenamePat, PNS_runtime_api, 1);
}
+void (*on_main_thread_exit)(void) = NULL;
+
/* The public API for writing profile data into the file with name
* set by previous calls to __llvm_profile_set_filename or
* __llvm_profile_override_default_filename or
@@ -1097,6 +1100,9 @@ int __llvm_profile_write_file(void) {
// Temporarily suspend getting SIGKILL when the parent exits.
int PDeathSig = lprofSuspendSigKill();
+ if (on_main_thread_exit)
+ on_main_thread_exit();
+
if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
PROF_NOTE("Profile data not written to file: %s.\n", "already written");
if (PDeathSig == 1)
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index b766436497b74..4f96523a56a37 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -45,6 +45,7 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
diff --git a/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
new file mode 100644
index 0000000000000..fc5f785e1ab40
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
@@ -0,0 +1,123 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
+ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
+ defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+}
+
+extern "C" {
+
+#define PROF_TLS_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_TLS_CNTS_COMMON)
+#define PROF_TLS_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_TLS_CNTS_COMMON)
+
+extern char PROF_TLS_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_TLS_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
+COMPILER_RT_VISIBILITY char *__llvm_profile_begin_tls_counters(void) { // address of the section-start symbol for INSTR_PROF_TLS_CNTS_COMMON
+ return &PROF_TLS_CNTS_START;
+}
+COMPILER_RT_VISIBILITY char *__llvm_profile_end_tls_counters(void) { // address of the section-stop symbol for INSTR_PROF_TLS_CNTS_COMMON
+ return &PROF_TLS_CNTS_STOP;
+}
+
+struct finalization_data { // state passed from __llvm_profile_tls_counters_finalize to the dl_iterate_phdr callback
+ char *mod_begin; // dli_fbase reported by dladdr() for the global counters section
+ char *tls_img_begin; // result of __llvm_profile_begin_tls_counters()
+ char *tls_img_end; // result of __llvm_profile_end_tls_counters()
+ char *cnts_begin; // result of __llvm_profile_begin_counters()
+ char *cnts_end; // result of __llvm_profile_end_counters(); NOTE(review): only null-checked, never used as a bound
+};
+
+// dl_iterate_phdr() callback: finds the module whose dlpi_addr equals
+// fdata->mod_begin and atomically adds the calling thread's TLS counters for
+// that module to the global counters. This is O(num_modules + num_counters)
+// unfortunately; a thread-local analogue of __start_$sectionname would simplify
+// the implementation a lot and make this just O(num_counters).
+static int FindAndAddCounters_cb(struct dl_phdr_info *info, size_t size, // size is unused
+ void *data) {
+ finalization_data *fdata = (finalization_data *)data;
+ char *mod_begin = fdata->mod_begin;
+ // We're looking for a match to the dli_fbase that dladdr() reported for the global counters section
+ if (mod_begin != (char *)info->dlpi_addr) { // NOTE(review): dli_fbase and dlpi_addr may differ for non-PIE executables - confirm
+ return 0; // not this module; keep iterating
+ }
+
+ if (info->dlpi_tls_data == NULL) {
+ return 1; // no TLS data for this module in the calling thread; stop iterating
+ }
+
+ const Elf64_Phdr *hdr = info->dlpi_phdr;
+ const Elf64_Phdr *last_hdr = hdr + info->dlpi_phnum;
+
+ const Elf64_Phdr *tls_hdr;
+ for (; hdr != last_hdr; ++hdr) {
+ if (hdr->p_type == PT_TLS) {
+ tls_hdr = hdr;
+ goto found_tls_ph;
+ }
+ }
+ return 1; // module has no PT_TLS program header; stop iterating
+found_tls_ph:
+ uint64_t num_counters =
+ __llvm_profile_get_num_counters(fdata->tls_img_begin, fdata->tls_img_end);
+ uint64_t counter_size = __llvm_profile_counter_entry_size();
+
+ // Calculate the offset of __llvm_tls_prf_cnts into the tls block for this
+ // module. The addresses in use below correspond to the tls initialization
+ // image, which is statically allocated for the module, rather than the TLS
+ // block itself.
+ uint64_t ph_true_vaddr =
+ (uint64_t)info->dlpi_addr + (uint64_t)tls_hdr->p_vaddr;
+ uint64_t tls_cnts_tlsblk_offset =
+ (uint64_t)fdata->tls_img_begin - ph_true_vaddr;
+
+ // Calculate the thread local copy of __llvm_tls_prf_cnts for this module.
+ uint64_t tls_prf_cnts_modlocal_begin =
+ (uint64_t)info->dlpi_tls_data + tls_cnts_tlsblk_offset;
+
+ // We don't support single byte counters because they are also resilient to
+ // thread synchronization issues and they are designed to avoid memory
+ // overhead, which is the opposite of what TL counters do.
+ // TODO: warn when the counter size is not 8 bytes?
+ if (counter_size == sizeof(uint64_t)) {
+ uint64_t *tls_cnt = (uint64_t *)tls_prf_cnts_modlocal_begin;
+ uint64_t *tls_end = (uint64_t *)tls_cnt + num_counters;
+ uint64_t *cnt = (uint64_t *)fdata->cnts_begin;
+ for (; tls_cnt != tls_end; tls_cnt++, cnt++) {
+ __atomic_fetch_add(cnt, *tls_cnt, __ATOMIC_RELAXED); // NOTE(review): cnt is never checked against fdata->cnts_end; assumes both sections hold the same number of counters
+ }
+ }
+ return 1; // matching module handled; stop iterating
+}
+
+// Adds the calling thread's TLS counters for this module to the module's
+// global counters. Returns without doing anything when any section bound is
+// null or when dladdr() cannot resolve the global counters section.
+COMPILER_RT_VISIBILITY
+void __llvm_profile_tls_counters_finalize(void) {
+  char *const tls_image_first = __llvm_profile_begin_tls_counters();
+  char *const tls_image_last = __llvm_profile_end_tls_counters();
+  char *const global_first = __llvm_profile_begin_counters();
+  char *const global_last = __llvm_profile_end_counters();
+
+  const bool have_all_bounds =
+      tls_image_first && tls_image_last && global_first && global_last;
+  if (!have_all_bounds)
+    return;
+
+  // Identify the module that owns the global counters section.
+  Dl_info owner;
+  if (dladdr(global_first, &owner) == 0)
+    return;
+
+  struct finalization_data request = {0};
+  request.mod_begin = (char *)owner.dli_fbase;
+  request.tls_img_begin = tls_image_first;
+  request.tls_img_end = tls_image_last;
+  request.cnts_begin = global_first;
+  request.cnts_end = global_last;
+
+  // The callback locates that module among the loaded objects and merges.
+  dl_iterate_phdr(FindAndAddCounters_cb, &request);
+}
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.c b/compiler-rt/lib/profile/InstrProfilingTLS.c
new file mode 100644
index 0000000000000..029ed9e542e5a
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.c
@@ -0,0 +1,29 @@
+#include "InstrProfilingTLS.h"
+#include "InstrProfiling.h"
+
+struct texit_fn_node module_node COMPILER_RT_VISIBILITY;
+
+// We act as a shim between the profile_threadlocal sharedlib
+// and the profile static lib. We need to tell the static lib
+// to add all of the counters up on main thread exit, but the
+// shared lib is the one that knows how to do that and whether it's
+// already been done.
+//
+// In the constructor we pass flush_main_thread_counters from the
+// sharedlib to the non-tls statlib's on_main_thread_exit fnptr.
+extern void flush_main_thread_counters(void);
+extern void (*on_main_thread_exit)(void);
+
+// Module constructor: links this module's node, whose handler is
+// __llvm_profile_tls_counters_finalize, into the thread-exit registry, and
+// installs flush_main_thread_counters as on_main_thread_exit if no hook has
+// been set yet.
+__attribute__((constructor)) COMPILER_RT_VISIBILITY void
+__llvm_profile_tls_register_thread_exit_handler(void) {
+  module_node.fn = __llvm_profile_tls_counters_finalize;
+  module_node.next = NULL;
+  module_node.prev = NULL;
+  register_tls_prfcnts_module_thread_exit_handler(&module_node);
+
+  // Keep a hook that is already installed.
+  if (on_main_thread_exit)
+    return;
+  on_main_thread_exit = flush_main_thread_counters;
+}
+
+// TODO: Add a destructor that unlinks module_node again via
+// unregister_tls_prfcnts_module_thread_exit_handler() (deliberately deferred for now).
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.h b/compiler-rt/lib/profile/InstrProfilingTLS.h
new file mode 100644
index 0000000000000..1b6001d27d375
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.h
@@ -0,0 +1,39 @@
+#ifndef INSTR_PROFILING_TLS_H
+#define INSTR_PROFILING_TLS_H
+
+char *__llvm_profile_begin_tls_counters(void);
+char *__llvm_profile_end_tls_counters(void);
+
+/*!
+ * \brief Add counter values from TLS to the global counters for the program
+ *
+ * On thread exit, atomically add the values in TLS counters to the static
+ * counters for the whole process.
+ */
+void __llvm_profile_tls_counters_finalize(void);
+
+/*
+ * Dylib stuff
+ */
+typedef void (*texit_fnc)(void);
+
+typedef struct texit_fn_node { // one entry in the doubly linked list of thread-exit handlers
+ struct texit_fn_node *prev; // previous node in the list
+ texit_fnc fn; // handler to invoke on thread exit
+ struct texit_fn_node *next; // next node in the list
+} texit_fn_node;
+
+// TODO: really this should be write-preferring rwlocked
+struct texit_fn_registry {
+ int texit_mtx; // lock word guarding the list - presumably 0 = unlocked, 1 = locked; confirm against the implementation
+ texit_fn_node head; // list head node
+ texit_fn_node tail; // list tail node
+};
+
+/* Link new_node into the registry of thread-exit handlers. */
+void register_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+/* Unlink a previously registered node from the registry. */
+void unregister_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+/* Invoke every registered thread-exit handler. */
+void run_thread_exit_handlers(void);
+
+/* Install the runtime's interceptors. Declared with (void) like its siblings:
+ * in C an empty parameter list leaves the parameters unspecified. */
+void register_profile_intercepts(void);
+
+#endif
diff --git a/compiler-rt/lib/profile/InstrProfilingTLSDyLib.c b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.c
new file mode 100644
index 0000000000000..e82780dbcf6ab
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.c
@@ -0,0 +1,100 @@
+#include "InstrProfiling.h"
+#include "InstrProfilingTLS.h"
+#include <stdlib.h>
+
+// Maintain a linked list of handlers to run on thread exit.
+// This is broken out into a dylib so that the registry is truly global across
+// dlopen et al.
+//
+// Each module has a statically allocated node that gets linked into the
+// registry on the constructor and that gets linked out of the registry on
+// destroy.
+//
+// This node is defined in the static portion of the tls counts extension.
+
+struct texit_fn_registry texit_registry;
+
+// Acquires the registry spin lock: busy-waits until texit_mtx is swapped from
+// 0 to 1 by a strong compare-exchange with acquire ordering.
+static void lock_texit_registry(void) {
+  for (;;) {
+    int unlocked = 0;
+    if (__atomic_compare_exchange_n(&texit_registry.texit_mtx, &unlocked, 1,
+                                    0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+      return;
+  }
+}
+
+static void unlock_texit_registry(void) { // releases the registry spin lock
+ __atomic_store_n(&texit_registry.texit_mtx, 0, __ATOMIC_RELEASE); // pairs with the acquire compare-exchange in lock_texit_registry
+}
+
+static void wlock_texit_registry(void) { lock_texit_registry(); } // writer lock: currently the same exclusive spin lock
+
+static void wunlock_texit_registry(void) { unlock_texit_registry(); } // writer unlock
+
+static void rlock_texit_registry(void) { lock_texit_registry(); } // reader lock: currently also exclusive, readers do not share
+
+static void runlock_texit_registry(void) { unlock_texit_registry(); } // reader unlock
+
+// Returns the pointer stored in *nodepp and leaves NULL in its place.
+static inline texit_fn_node *take_nodep(texit_fn_node **nodepp) {
+  texit_fn_node *const taken = *nodepp;
+  *nodepp = NULL;
+  return taken;
+}
+
+// Stores new_nodep in *nodepp and returns the pointer that was there before.
+static inline texit_fn_node *replace_nodep(texit_fn_node **nodepp,
+                                           texit_fn_node *new_nodep) {
+  texit_fn_node *const previous = *nodepp;
+  *nodepp = new_nodep;
+  return previous;
+}
+
+// Runs the thread-exit handlers on the first call only; later calls return
+// immediately. The guard flag is a plain (non-atomic) static.
+void flush_main_thread_counters(void) {
+  static int flushed = 0;
+  if (flushed)
+    return;
+  run_thread_exit_handlers();
+  flushed = 1;
+}
+
+__attribute__((constructor)) static void __initialize_tls_exit_registry() { // constructor for the shared runtime: installs interceptors and sets up an empty registry
+ register_profile_intercepts();
+ texit_registry.texit_mtx = 0; // lock starts out released
+ texit_registry.head.prev = NULL;
+ texit_registry.head.fn = NULL; // sentinels carry no handler
+ texit_registry.head.next = &texit_registry.tail; // empty list: head and tail sentinels link to each other
+ texit_registry.tail.prev = &texit_registry.head;
+ texit_registry.tail.fn = NULL;
+ texit_registry.tail.next = NULL;
+}
+
+// Should run from module constructor. Appends new_nodep at the end of the list, just before the tail sentinel.
+void register_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_nodep) {
+ wlock_texit_registry();
+ texit_fn_node *prev = replace_nodep(&texit_registry.tail.prev, new_nodep); // prev = old last node; tail.prev now points at the new node
+ texit_fn_node *next = replace_nodep(&prev->next, new_nodep); // next = what the old last node pointed to (the tail sentinel); it now points at the new node
+ new_nodep->next = next;
+ new_nodep->prev = prev;
+ wunlock_texit_registry();
+}
+
+// Should run from module destructor. Unlinks old_nodep from the list and clears its prev/next links.
+// Also, this destructor/constructor pair should be outermost. At least outside
+// of the regular llvm_profile stuff.
+void unregister_tls_prfcnts_module_thread_exit_handler(
+ texit_fn_node *old_nodep) {
+ wlock_texit_registry();
+ texit_fn_node *prev = take_nodep(&old_nodep->prev); // reads and clears old_nodep->prev
+ texit_fn_node *next = take_nodep(&old_nodep->next); // reads and clears old_nodep->next
+ prev->next = next; // NOTE(review): assumes old_nodep is currently linked; prev/next would be NULL otherwise
+ next->prev = prev;
+ wunlock_texit_registry();
+}
+
+// Walks the registry from the node after head up to (not including) tail and
+// calls every non-NULL handler, holding the registry lock for the whole walk.
+void run_thread_exit_handlers(void) {
+  rlock_texit_registry();
+  texit_fn_node *cursor = texit_registry.head.next;
+  while (cursor != &texit_registry.tail) {
+    if (cursor->fn != NULL)
+      cursor->fn();
+    cursor = cursor->next;
+  }
+  runlock_texit_registry();
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingTLSDyLib.h b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.h
new file mode 100644
index 0000000000000..3c429d81129ec
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.h
@@ -0,0 +1,4 @@
+#ifndef INSTR_PROFILING_TLS_DYLIB_H
+#define INSTR_PROFILING_TLS_DYLIB_H
+
+#endif
diff --git a/compiler-rt/lib/tsan/rtl/CMakeLists.txt b/compiler-rt/lib/tsan/rtl/CMakeLists.txt
index f40e72dbde1f9..8ddb6af279284 100644
--- a/compiler-rt/lib/tsan/rtl/CMakeLists.txt
+++ b/compiler-rt/lib/tsan/rtl/CMakeLists.txt
@@ -1,6 +1,6 @@
include_directories(../..)
-set(TSAN_RTL_CFLAGS ${TSAN_CFLAGS})
+set(TSAN_RTL_CFLAGS ${TSAN_CFLAGS})
append_list_if(COMPILER_RT_HAS_MSSE4_2_FLAG -msse4.2 TSAN_RTL_CFLAGS)
append_list_if(SANITIZER_LIMIT_FRAME_SIZE -Wframe-larger-than=530
TSAN_RTL_CFLAGS)
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c
new file mode 100644
index 0000000000000..fcf874000aa8e
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c
@@ -0,0 +1,7 @@
+unsigned char determine_value_dyn(unsigned char c) { // test fixture: two branches so the profile has distinct counters to check
+ if (c < 0x80) { // values below 0x80 pass through unchanged
+ return c;
+ } else {
+ return -c; // negated as int, then converted back to unsigned char (wraps modulo 256)
+ }
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c
new file mode 100644
index 0000000000000..309d405430af4
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c
@@ -0,0 +1,93 @@
+#include <dlfcn.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct thread_arg { // input/output record shared between main and thread_fn
+ uint64_t buf_size; // number of bytes in buf
+ char const *buf; // input bytes fed to determine_value_dyn
+ uint64_t iteration_counter; // incremented once per processed byte
+ uint64_t output; // running sum of determine_value_dyn results
+};
+
+#ifndef DLOPEN_FUNC_DIR
+unsigned char determine_value_dyn(unsigned char);
+#endif
+
+void *thread_fn(void *arg_ptr) { // worker: resolves determine_value_dyn (via dlopen when DLOPEN_FUNC_DIR is set) and sums it over the buffer
+#ifdef DLOPEN_FUNC_DIR
+
+  unsigned char (*determine_value_dyn)(unsigned char) = NULL;
+
+  const char *dynlib_name = DLOPEN_FUNC_DIR "/lib.shared";
+  const char *dynlib_sym = "determine_value_dyn";
+  void *handle = dlopen(dynlib_name, DLOPEN_FLAGS);
+  if (handle == NULL) {
+    fprintf(stderr, "dlopen error on: %s: %s\n", dynlib_name, dlerror());
+    exit(EXIT_FAILURE);
+  }
+
+  determine_value_dyn = dlsym(handle, dynlib_sym);
+  if (determine_value_dyn == NULL) { // check the looked-up symbol, not the already-validated handle
+    fprintf(stderr, "dlsym error on: %s : %s\n", dynlib_name, dynlib_sym);
+    exit(EXIT_FAILURE);
+  }
+#endif
+
+  struct thread_arg *arg = (struct thread_arg *)arg_ptr;
+  for (uint64_t i = 0; i < arg->buf_size; i++) {
+    unsigned char c = (unsigned char)arg->buf[i];
+    arg->output += determine_value_dyn(c);
+    arg->iteration_counter++;
+  }
+
+  // This should unload the thread local counters region for this module,
+  // causing an expected failure for -fprofile-thread-local
+#ifdef DLOPEN_FUNC_DIR
+# ifndef DONT_CLOSE
+  dlclose(handle);
+# endif
+#endif
+  return NULL;
+}
+
+int main() { // builds a 40000-byte pattern, runs thread_fn over it on one thread, prints the results
+  const uint64_t len = 40000;
+
+  char *example_string = (char *)malloc(sizeof(char) * len);
+  int high = 0;
+  for (uint64_t i = 0; i < len; i++) { // every third byte is 0xff, the others are 0x0
+    if (high == 2) {
+      example_string[i] = 0xff;
+      high = 0;
+    } else {
+      example_string[i] = 0x0;
+      high++;
+    }
+  }
+
+  pthread_t thread;
+  struct thread_arg arg = {
+      len,
+      example_string,
+      0,
+      0,
+  };
+  if (pthread_create(&thread, NULL, thread_fn, &arg) != 0) {
+    fprintf(stderr, "Failed to spawn thread, exiting\n");
+    exit(EXIT_FAILURE); // a failed spawn must not be reported as success
+  }
+
+  if (pthread_join(thread, NULL) != 0) {
+    fprintf(stderr, "Failed to join thread, exiting\n");
+    return EXIT_FAILURE;
+  }
+
+  printf("Thread output:\n"
+         "iteration_counter: %lu\n"
+         "output: %lx\n\n",
+         arg.iteration_counter, arg.output);
+
+  return EXIT_SUCCESS;
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c
new file mode 100644
index 0000000000000..9ec903ab4c17a
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c
@@ -0,0 +1,9 @@
+#include <stdint.h>
+
+int8_t func(int8_t input) { // test fixture: two branches for the profile to count
+ if (input < 0) { // negative input is returned unchanged
+ return input;
+ } else {
+ return -input; // non-negative input is negated
+ }
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c
new file mode 100644
index 0000000000000..94122d793a6ee
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c
@@ -0,0 +1,9 @@
+#include <stdint.h>
+
+int8_t func2(int8_t input) { // test fixture: returns -1 for non-negative input and 1 for negative input
+ if (input >= 0) {
+ return -1;
+ } else {
+ return 1;
+ }
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c
new file mode 100644
index 0000000000000..fc436841d233c
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c
@@ -0,0 +1,105 @@
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+// With DLOPEN_FUNC_DIR, func/func2 are pointers that main() fills via dlsym().
+#ifdef DLOPEN_FUNC_DIR
+# include <dlfcn.h>
+int8_t (*func)(int8_t) = NULL;
+int8_t (*func2)(int8_t) = NULL;
+#else
+int8_t func(int8_t);
+int8_t func2(int8_t);
+#endif
+
+struct thread_arg { // Work item handed to thread_fn through pthread_create.
+ uint64_t buf_size; // Number of bytes thread_fn reads from buf.
+ char const *buf; // Input bytes; thread_fn only reads them.
+ uint64_t output; // Running sum of the func() and func2() results.
+};
+// Worker: adds func(c) + func2(c) to arg->output for every byte c of arg->buf.
+void *thread_fn(void *arg_ptr) {
+ struct thread_arg *arg = (struct thread_arg *)arg_ptr;
+ for (uint64_t i = 0; i < arg->buf_size; i++) {
+ int8_t c = (int8_t)arg->buf[i]; // Treat the byte as signed for func/func2.
+ arg->output += func(c);
+ arg->output += func2(c);
+ }
+ return NULL;
+}
+
+int main() {
+  // Driver: when DLOPEN_FUNC_DIR is defined, loads func/func2 with dlopen();
+  // then runs n_threads workers over one shared buffer and prints each output.
+#define n_threads 10
+#define len 40000
+#ifdef DLOPEN_FUNC_DIR
+  const char *dynlib_path = DLOPEN_FUNC_DIR "/func.shared";
+  const char *dynlib_sym = "func";
+  void *handle = dlopen(dynlib_path, RTLD_LAZY);
+  if (handle == NULL) {
+    fprintf(stderr, "dlopen error on: %s: %s\n", dynlib_path, dlerror());
+    return EXIT_FAILURE;
+  }
+
+  func = dlsym(handle, dynlib_sym);
+  if (func == NULL) {
+    fprintf(stderr, "dlsym error on: %s : %s\n", dynlib_path, dynlib_sym);
+    return EXIT_FAILURE;
+  }
+
+  const char *dynlib_path2 = DLOPEN_FUNC_DIR "/func2.shared";
+  const char *dynlib_sym2 = "func2";
+  void *handle2 = dlopen(dynlib_path2, RTLD_LAZY);
+  if (handle2 == NULL) {
+    fprintf(stderr, "dlopen error on: %s: %s\n", dynlib_path2, dlerror());
+    return EXIT_FAILURE;
+  }
+
+  func2 = dlsym(handle2, dynlib_sym2);
+  if (func2 == NULL) {
+    fprintf(stderr, "dlsym error on: %s : %s\n", dynlib_path2, dynlib_sym2);
+    return EXIT_FAILURE;
+  }
+#endif
+  pthread_t threads[n_threads] = {0};
+  struct thread_arg args[n_threads] = {0};
+  char *example_string = (char *)malloc(sizeof(char) * len);
+  int high = 0;
+  for (uint64_t i = 0; i < len; i++) {
+    if (high == 2) {
+      example_string[i] = 0xff;
+      high = 0;
+    } else {
+      example_string[i] = 0x0;
+      high++;
+    }
+  }
+
+  for (unsigned long i = 0; i < n_threads; i++) {
+    struct thread_arg a = {
+        len,
+        example_string,
+        0,
+    };
+    args[i] = a;
+    if (pthread_create(&threads[i], NULL, thread_fn, &args[i]) != 0) {
+      fprintf(stderr, "Failed to spawn thread %lu, exiting\n", i);
+      return EXIT_FAILURE;
+    }
+  }
+  // Join every worker, even after a failed join, so each output is printed.
+  int rc = EXIT_SUCCESS;
+  for (unsigned long i = 0; i < n_threads; i++) {
+    void *retval = NULL;
+    if (pthread_join(threads[i], &retval) != 0) {
+      fprintf(stderr, "Failed to join thread %lu, continuing\n", i);
+      rc = EXIT_FAILURE;
+    }
+    // output is uint64_t; the cast makes the argument match %llx everywhere.
+    printf("Thread %lu output:\n"
+           "output: %llx\n\n",
+           i, (unsigned long long)args[i].output);
+  }
+  return rc;
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-exit.c b/compiler-rt/test/profile/Inputs/instrprof-tls-exit.c
new file mode 100644
index 0000000000000..f7e6f78f019db
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-exit.c
@@ -0,0 +1,37 @@
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+// Worker: blocks on the semaphore passed in arg_ptr, then calls exit(0).
+void *exit_thread(void *arg_ptr) {
+ sem_t *s_p = (sem_t *)arg_ptr;
+ printf("Exit thread waiting...\n");
+ if (sem_wait(s_p)) {
+ fprintf(stderr, "Failed to wait on signal from main thread\n");
+ exit(EXIT_FAILURE);
+ }
+ printf("Exit thread activated\n");
+ exit(0); // Ends the whole process from this non-main thread.
+ return NULL; // Not reached; satisfies the pthread start-routine signature.
+}
+// Driver: releases exit_thread via the semaphore; returning here is a failure.
+int main() {
+ pthread_t exit; // NOTE: shadows exit() inside main; main never calls it.
+ sem_t s;
+ sem_init(&s, 0, 0); // Process-private, initial value 0; result is unchecked.
+ if (pthread_create(&exit, NULL, exit_thread, &s) != 0) {
+ fprintf(stderr, "Failed to spawn exit thread\n");
+ return EXIT_FAILURE;
+ }
+ if (sem_post(&s)) {
+ fprintf(stderr, "Failed to send signal to exit thread\n");
+ return EXIT_FAILURE;
+ }
+ if (pthread_join(exit, NULL)) {
+ fprintf(stderr, "Failed to join exit thread\n");
+ return EXIT_FAILURE;
+ }
+ fprintf(stderr, "Child thread should have called exit()\n");
+ return EXIT_FAILURE;
+}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test
new file mode 100644
index 0000000000000..3974102090b4e
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test
@@ -0,0 +1,27 @@
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+// The thread-local run below is expected to segfault until the dlclose issue is fixed.
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-lib.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-lib.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: diff %t-lib.tls.ll %t-lib.atomic.ll
+
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test
new file mode 100644
index 0000000000000..8cafef927ee7e
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test
@@ -0,0 +1,41 @@
+// Passing subset of combos where you still get coverage from modules
+// which were opened with RTLD_NODELETE and later closed.
+//
+// These combos work because pthread_create is intercepted before
+// it is first called.
+
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic-tl.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib: not working yet, so it is omitted from this subset.
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll %t-main.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test
new file mode 100644
index 0000000000000..74d5f9e2a4f58
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test
@@ -0,0 +1,48 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic-tl.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib
+
+RUN: mkdir -p %t.tl-atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tl-atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tl-atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tl-atomic -rpath %t.tl-atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+RUN: env LLVM_PROFILE_FILE=%t-tl-atomic.profraw %run %t-tl-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: llvm-profdata merge -o %t-tl-atomic.profdata %t-tl-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-tl-atomic.profdata -o %t-main.tl-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll %t-main.atomic.ll
+RUN: diff %t-main.tl-atomic.ll %t-main.atomic.ll
+
+// Atomic exe, threadlocal lib does not pass.
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test
new file mode 100644
index 0000000000000..8e99a3b60a69a
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test
@@ -0,0 +1,24 @@
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_NODELETE | RTLD_LAZY" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_NODELETE | RTLD_LAZY" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-lib.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-lib.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: diff %t-lib.tls.ll %t-lib.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test
new file mode 100644
index 0000000000000..990c87e1bd86b
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test
@@ -0,0 +1,32 @@
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/func.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/func2.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func2.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlopen-main.c
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/func.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/func2.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func2.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlopen-main.c
+
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-func2.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func2.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-func2.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func2.c
+RUN: diff %t-func2.tls.ll %t-func2.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-func.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-func.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: diff %t-func.tls.ll %t-func.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-exit.test b/compiler-rt/test/profile/Linux/instrprof-tls-exit.test
new file mode 100644
index 0000000000000..fef3c78f0726c
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-exit.test
@@ -0,0 +1,17 @@
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic %S/../Inputs/instrprof-tls-exit.c
+
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls %S/../Inputs/instrprof-tls-exit.c
+
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+
+RUN: %clang_profuse=%t-tls.profdata -o %t-tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-exit.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-exit.c
+RUN: diff %t-tls.ll %t-atomic.ll
+
+# In the first iteration of this change, coverage output is only as expected when the process
+# exits via the main thread; this test exits from a child thread, so it is expected to fail.
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test b/compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test
new file mode 100644
index 0000000000000..67cb1d2f66543
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test
@@ -0,0 +1,51 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.atomic-tl.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib
+
+RUN: mkdir -p %t.tl-atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tl-atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -g3 -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.tl-atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tl-atomic -rpath %t.tl-atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+RUN: env LLVM_PROFILE_FILE=%t-tl-atomic.profraw %run %t-tl-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: llvm-profdata merge -o %t-tl-atomic.profdata %t-tl-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-tl-atomic.profdata -o %t-main.tl-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll %t-main.atomic.ll
+
+// The failure associated with threadlocal dlopened lib, atomic-update executable is that pthread_create
+// is not intercepted before it is first called. That means that we can't run the thread exit handler.
+RUN: diff %t-main.tl-atomic.ll %t-main.atomic.ll
+
+// Atomic exe, threadlocal lib does not pass.
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test
new file mode 100644
index 0000000000000..6d6b7e4b3bb59
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test
@@ -0,0 +1,35 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d -L%t.tls.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d -L%t.atomic.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d -L%t.atomic-tl.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib: does not pass yet, so it is omitted from this subset.
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll %t-main.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test
new file mode 100644
index 0000000000000..1fb58128ada32
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test
@@ -0,0 +1,48 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d -L%t.tls.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d -L%t.atomic.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d -L%t.atomic-tl.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib
+
+RUN: mkdir -p %t.tl-atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tl-atomic.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tl-atomic -rpath %t.tl-atomic.d -L%t.tl-atomic.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+RUN: env LLVM_PROFILE_FILE=%t-tl-atomic.profraw %run %t-tl-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: llvm-profdata merge -o %t-tl-atomic.profdata %t-tl-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-main.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-tl-atomic.profdata -o %t-main.tl-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll %t-main.atomic.ll
+RUN: diff %t-main.tl-atomic.ll %t-main.atomic.ll
+
+// Atomic exe, threadlocal lib does not pass.
+XFAIL: target={{.*}}
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 817ad9550f652..0c78450641db6 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -100,6 +100,9 @@ inline StringRef getInstrProfDataVarPrefix() { return "__profd_"; }
/// Return the name prefix of profile counter variables.
inline StringRef getInstrProfCountersVarPrefix() { return "__profc_"; }
+/// Return the name prefix of thread-local profile counter variables.
+inline StringRef getInstrProfCountersTLSVarPrefix() { return "__profc_tls_"; }
+
/// Return the name prefix of profile bitmap variables.
inline StringRef getInstrProfBitmapVarPrefix() { return "__profbm_"; }
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index e9866d94b762c..8655bcf498437 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -312,6 +312,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_tls_cnts, \
+ INSTR_PROF_QUOTE(INSTR_PROF_TLS_CNTS_COMMON), \
+ INSTR_PROF_CNTS_COFF, "__DATA,")
INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
INSTR_PROF_BITS_COFF, "__DATA,")
@@ -750,6 +753,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
#define INSTR_PROF_NAME_COMMON __llvm_prf_names
#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
#define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_TLS_CNTS_COMMON __llvm_tls_prf_cnts
#define INSTR_PROF_BITS_COMMON __llvm_prf_bits
#define INSTR_PROF_VALS_COMMON __llvm_prf_vals
#define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f9b58d9f27821..d4005350e84ca 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -85,6 +85,11 @@ cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate(
"Use debug info to correlate"),
clEnumValN(InstrProfCorrelator::BINARY, "binary",
"Use binary to correlate")));
+
+// Opt-in switch for thread local counters: when set, counter addresses are
+// resolved against the thread local counter arrays. Off by default.
+cl::opt<bool>
+ InstrProfThreadLocal("instr-prof-thread-local",
+ cl::desc("Generate thread local counter regions"),
+ cl::init(false));
} // namespace llvm
namespace {
@@ -215,6 +220,10 @@ class InstrLowerer final {
struct PerFunctionProfileData {
uint32_t NumValueSites[IPVK_Last + 1] = {};
GlobalVariable *RegionCounters = nullptr;
+ GlobalVariable *TLSRegionCounters = nullptr;
+ // Both a regular DataVar and a TLS DataVar must exist when TLS counters are
+ // in use.
+ GlobalVariable *TLSDataVar = nullptr;
GlobalVariable *DataVar = nullptr;
GlobalVariable *RegionBitmaps = nullptr;
uint32_t NumBitmapBytes = 0;
@@ -286,16 +295,24 @@ class InstrLowerer final {
/// acts on.
Value *getCounterAddress(InstrProfCntrInstBase *I);
+ Value *getThreadLocalCounterAddress(InstrProfCntrInstBase *I);
+
/// Get the region counters for an increment, creating them if necessary.
///
/// If the counter array doesn't yet exist, the profile data variables
/// referring to them will also be created.
GlobalVariable *getOrCreateRegionCounters(InstrProfCntrInstBase *Inc);
+ /// Get the thread local region counters, creating them if necessary.
+ /// These must exist alongside the global region counters.
+ GlobalVariable *
+ getOrCreateThreadLocalRegionCounters(InstrProfCntrInstBase *Inc);
+
/// Create the region counters.
GlobalVariable *createRegionCounters(InstrProfCntrInstBase *Inc,
StringRef Name,
- GlobalValue::LinkageTypes Linkage);
+ GlobalValue::LinkageTypes Linkage,
+ bool ThreadLocal);
/// Compute the address of the test vector bitmap that this profiling
/// instruction acts on.
@@ -608,6 +625,7 @@ enum class ValueProfilingCallType {
} // end anonymous namespace
+// TODO: put TLS counters incompatibility checks here
PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
ModuleAnalysisManager &AM) {
FunctionAnalysisManager &FAM =
@@ -894,6 +912,9 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
auto *Counters = getOrCreateRegionCounters(I);
+ if (InstrProfThreadLocal) {
+ return getThreadLocalCounterAddress(I);
+ }
IRBuilder<> Builder(I);
if (isa<InstrProfTimestampInst>(I))
@@ -932,6 +953,22 @@ Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
return Builder.CreateIntToPtr(Add, Addr->getType());
}
+/// Compute the address of the thread local counter that the profiling
+/// instruction \p I acts on.
+Value *InstrLowerer::getThreadLocalCounterAddress(InstrProfCntrInstBase *I) {
+  GlobalVariable *TLSCounters = getOrCreateThreadLocalRegionCounters(I);
+  IRBuilder<> Builder(I);
+
+  if (isa<InstrProfTimestampInst>(I))
+    TLSCounters->setAlignment(Align(8));
+
+  // Resolve the counter array for the current thread first, then index the
+  // slot selected by the instruction.
+  Value *TLSBase = Builder.CreateThreadLocalAddress(TLSCounters);
+  const uint64_t CounterIdx = I->getIndex()->getZExtValue();
+  auto *SlotAddr = Builder.CreateConstInBoundsGEP2_32(
+      TLSCounters->getValueType(), TLSBase, 0, CounterIdx);
+
+  // No runtime relocation bias is applied on this path.
+  assert(!isRuntimeCounterRelocationEnabled());
+  return SlotAddr;
+}
+
Value *InstrLowerer::getBitmapAddress(InstrProfMCDCTVBitmapUpdate *I) {
auto *Bitmaps = getOrCreateRegionBitmaps(I);
IRBuilder<> Builder(I);
@@ -1391,13 +1428,18 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
VarPrefix = getInstrProfCountersVarPrefix();
VarName = getVarName(Inc, VarPrefix, Renamed);
InstrProfCntrInstBase *CntrIncrement = dyn_cast<InstrProfCntrInstBase>(Inc);
- Ptr = createRegionCounters(CntrIncrement, VarName, Linkage);
+ Ptr = createRegionCounters(CntrIncrement, VarName, Linkage, false);
} else if (IPSK == IPSK_bitmap) {
VarPrefix = getInstrProfBitmapVarPrefix();
VarName = getVarName(Inc, VarPrefix, Renamed);
InstrProfMCDCBitmapInstBase *BitmapUpdate =
dyn_cast<InstrProfMCDCBitmapInstBase>(Inc);
Ptr = createRegionBitmaps(BitmapUpdate, VarName, Linkage);
+ } else if (IPSK == IPSK_tls_cnts) {
+ VarPrefix = getInstrProfCountersTLSVarPrefix();
+ VarName = getVarName(Inc, VarPrefix, Renamed);
+ InstrProfCntrInstBase *CntrIncrement = dyn_cast<InstrProfCntrInstBase>(Inc);
+ Ptr = createRegionCounters(CntrIncrement, VarName, Linkage, true);
} else {
llvm_unreachable("Profile Section must be for Counters or Bitmaps");
}
@@ -1440,7 +1482,8 @@ InstrLowerer::getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc) {
GlobalVariable *
InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name,
- GlobalValue::LinkageTypes Linkage) {
+ GlobalValue::LinkageTypes Linkage,
+ bool ThreadLocal) {
uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
auto &Ctx = M.getContext();
GlobalVariable *GV;
@@ -1460,6 +1503,7 @@ InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name,
Constant::getNullValue(CounterTy), Name);
GV->setAlignment(Align(8));
}
+ GV->setThreadLocal(ThreadLocal);
return GV;
}
@@ -1475,6 +1519,10 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts);
PD.RegionCounters = CounterPtr;
+ if (InstrProfThreadLocal) {
+ PD.TLSRegionCounters = setupProfileSection(Inc, IPSK_tls_cnts);
+ }
+
if (DebugInfoCorrelate ||
ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
LLVMContext &Ctx = M.getContext();
@@ -1518,6 +1566,21 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
return PD.RegionCounters;
}
+GlobalVariable *
+InstrLowerer::getOrCreateThreadLocalRegionCounters(InstrProfCntrInstBase *Inc) {
+  // Without the flag the thread local counters are never set up, so the
+  // lookup below would hand back a null pointer.
+  assert(InstrProfThreadLocal);
+  auto &ProfData = ProfileDataMap[Inc->getName()];
+  // Setting up the region counters also sets up the thread local counters
+  // when InstrProfThreadLocal is true.
+  if (!ProfData.TLSRegionCounters)
+    (void)getOrCreateRegionCounters(Inc);
+  return ProfData.TLSRegionCounters;
+}
+
void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// When debug information is correlated to profile data, a data variable
// is not needed.
@@ -1555,6 +1618,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
getVarName(Inc, getInstrProfCountersVarPrefix(), Renamed);
std::string DataVarName =
getVarName(Inc, getInstrProfDataVarPrefix(), Renamed);
+ std::string TLSDataVarName =
+ getVarName(Inc, getInstrProfCountersTLSVarPrefix(), Renamed);
auto *Int8PtrTy = PointerType::getUnqual(Ctx);
// Allocate statically the array of pointers to value profile nodes for
More information about the cfe-commits
mailing list