[clang] [compiler-rt] [llvm] [InstrProf] Created Thread local counter instrumentation, compiler-rt runtimes (PR #95494)

Andrew Wock via cfe-commits cfe-commits at lists.llvm.org
Thu Jun 13 19:36:52 PDT 2024


https://github.com/ajwock created https://github.com/llvm/llvm-project/pull/95494

LLVM can now generate increments to counters in thread local storage.

Use a new compiler-rt runtime to atomically add thread local counters to global counters on thread exit.

The clang driver will link the new runtime libraries in when the new option -fprofile-thread-local is specified.

More details available in the RFC on discourse.

>From 44e2159636efd601c90aced44856d17d77728caa Mon Sep 17 00:00:00 2001
From: Andrew Wock <ajwock at gmail.com>
Date: Tue, 4 Jun 2024 09:45:31 -0400
Subject: [PATCH] Created Thread local counter instrumentation.

LLVM can now generate increments to counters in thread local storage.

Use a new compiler-rt runtime to atomically add thread local
counters to global counters on thread exit.

The clang driver will link the new runtime libraries in when the
new option -fprofile-thread-local is specified.

Signed-off-by: Andrew Wock <ajwock at gmail.com>
---
 clang/docs/UsersManual.rst                    |   8 ++
 clang/include/clang/Basic/CodeGenOptions.def  |   1 +
 clang/include/clang/Driver/Options.td         |   3 +
 clang/include/clang/Driver/ToolChain.h        |   6 +
 clang/lib/Driver/ToolChain.cpp                |  10 ++
 clang/lib/Driver/ToolChains/Clang.cpp         |  12 ++
 clang/lib/Driver/ToolChains/Linux.cpp         |   7 +
 compiler-rt/include/profile/InstrProfData.inc |   4 +
 compiler-rt/lib/profile/CMakeLists.txt        |  35 +++++
 .../lib/profile/InstrProfilingDyLibLinux.cpp  |  63 +++++++++
 compiler-rt/lib/profile/InstrProfilingFile.c  |   6 +
 .../lib/profile/InstrProfilingPlatformLinux.c |   1 +
 .../profile/InstrProfilingStaticTLSLinux.cpp  | 123 ++++++++++++++++++
 compiler-rt/lib/profile/InstrProfilingTLS.c   |  29 +++++
 compiler-rt/lib/profile/InstrProfilingTLS.h   |  39 ++++++
 .../lib/profile/InstrProfilingTLSDyLib.c      | 100 ++++++++++++++
 .../lib/profile/InstrProfilingTLSDyLib.h      |   4 +
 compiler-rt/lib/tsan/rtl/CMakeLists.txt       |   2 +-
 .../Inputs/instrprof-tls-dlclose-lib.c        |   7 +
 .../Inputs/instrprof-tls-dlclose-main.c       |  93 +++++++++++++
 .../Inputs/instrprof-tls-dlopen-func.c        |   9 ++
 .../Inputs/instrprof-tls-dlopen-func2.c       |   9 ++
 .../Inputs/instrprof-tls-dlopen-main.c        | 105 +++++++++++++++
 .../test/profile/Inputs/instrprof-tls-exit.c  |  37 ++++++
 .../Linux/instrprof-tls-dlclose-memfault.test |  27 ++++
 .../instrprof-tls-dlclose-mix-subset.test     |  41 ++++++
 .../Linux/instrprof-tls-dlclose-mix.test      |  48 +++++++
 .../Linux/instrprof-tls-dlclose-nodelete.test |  24 ++++
 .../profile/Linux/instrprof-tls-dlopen.test   |  32 +++++
 .../profile/Linux/instrprof-tls-exit.test     |  17 +++
 .../Linux/instrprof-tls-noclose-mix.test      |  51 ++++++++
 .../instrprof-tls-shared-mix-subset.test      |  35 +++++
 .../Linux/instrprof-tls-shared-mix.test       |  48 +++++++
 llvm/include/llvm/ProfileData/InstrProf.h     |   3 +
 .../llvm/ProfileData/InstrProfData.inc        |   4 +
 .../Instrumentation/InstrProfiling.cpp        |  71 +++++++++-
 36 files changed, 1110 insertions(+), 4 deletions(-)
 create mode 100644 compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
 create mode 100644 compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
 create mode 100644 compiler-rt/lib/profile/InstrProfilingTLS.c
 create mode 100644 compiler-rt/lib/profile/InstrProfilingTLS.h
 create mode 100644 compiler-rt/lib/profile/InstrProfilingTLSDyLib.c
 create mode 100644 compiler-rt/lib/profile/InstrProfilingTLSDyLib.h
 create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c
 create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c
 create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c
 create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c
 create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c
 create mode 100644 compiler-rt/test/profile/Inputs/instrprof-tls-exit.c
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-exit.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test
 create mode 100644 compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test

diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index f954857b0235a..f7db513b92909 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2932,6 +2932,14 @@ indexed format, regardeless whether it is produced by frontend or the IR pass.
   overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported
   by the target, or ``single`` otherwise.
 
+.. option:: -fprofile-thread-local
+
+   Increment profile counters in thread local storage and atomically add their
+   values to global counters on thread exit.  This has the potential to deliver
+   both accuracy and high performance whenever there is high thread contention
+   on profile counters.  This is an experimental option and it is only supported
+   on 64-bit Linux.
+
 Fine Tuning Profile Collection
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 7ffc40a00504f..7cd0bfb6d71b5 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -218,6 +218,7 @@ ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
 /// instrumented. Selected group numbers can be 0 to N-1 inclusive.
 VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)
 VALUE_CODEGENOPT(ProfileSelectedFunctionGroup, 32, 0)
+CODEGENOPT(InstrProfileThreadLocal, 1, 0) ///< Counters are updated on a per-thread basis
 CODEGENOPT(CoverageMapping , 1, 0) ///< Generate coverage mapping regions to
                                    ///< enable code coverage analysis.
 CODEGENOPT(DumpCoverageMapping , 1, 0) ///< Dump the generated coverage mapping
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d44faa55c456f..aab5b63c991f1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1768,6 +1768,9 @@ def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
 def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
     Group<f_Group>, Visibility<[ClangOption, CLOption]>, MetaVarName<"<file>">,
     HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
+def fprofile_thread_local : Flag<["-"], "fprofile-thread-local">,
+    Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+    HelpText<"Generate profile counters in thread local storage">;
 def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">, Group<f_Group>,
     Visibility<[ClangOption, CLOption]>;
 def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 9789cfacafd78..162c730782afb 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -752,6 +752,12 @@ class ToolChain {
   virtual void addProfileRTLibs(const llvm::opt::ArgList &Args,
                                 llvm::opt::ArgStringList &CmdArgs) const;
 
+  /// addThreadLocalProfileRTLibs - With -fprofile-thread-local, add the
+  /// thread-local profile runtime static + shared library pair.
+  virtual void
+  addThreadLocalProfileRTLibs(const llvm::opt::ArgList &Args,
+                              llvm::opt::ArgStringList &CmdArgs) const;
+
   /// Add arguments to use system-specific CUDA includes.
   virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 40ab2e91125d1..4708cb7df5044 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1078,6 +1078,16 @@ void ToolChain::addProfileRTLibs(const llvm::opt::ArgList &Args,
   CmdArgs.push_back(getCompilerRTArgString(Args, "profile"));
 }
 
+void ToolChain::addThreadLocalProfileRTLibs(
+    const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const {
+  if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+    // Static first, so we can specify '-u' where needed
+    CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal"));
+    CmdArgs.push_back(getCompilerRTArgString(Args, "profile_threadlocal",
+                                             ToolChain::FT_Shared));
+  }
+}
+
 ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
     const ArgList &Args) const {
   if (runtimeLibType)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b8d8ff3db5d1f..cd63ac56fecf6 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -720,6 +720,18 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
     CmdArgs.push_back("-fcoverage-mcdc");
   }
 
+  if (Args.hasArg(options::OPT_fprofile_thread_local)) {
+    if (!ProfileGenerateArg)
+      D.Diag(clang::diag::err_drv_argument_only_allowed_with)
+          << "-fprofile-thread-local"
+          << "-fprofile-instr-generate";
+
+    // Clang cc1 is not in the know about thread local coverage, but llvm
+    // should be
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back("-instr-prof-thread-local");
+  }
+
   if (Arg *A = Args.getLastArg(options::OPT_ffile_compilation_dir_EQ,
                                options::OPT_fcoverage_compilation_dir_EQ)) {
     if (A->getOption().matches(options::OPT_ffile_compilation_dir_EQ))
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 2222dea431c3c..0a889f957786a 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -843,6 +843,13 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
     CmdArgs.push_back(Args.MakeArgString(
         Twine("-u", llvm::getInstrProfRuntimeHookVarName())));
   ToolChain::addProfileRTLibs(Args, CmdArgs);
+
+  if (needsProfileRT(Args) && Args.hasArg(options::OPT_fprofile_thread_local)) {
+    CmdArgs.push_back(Args.MakeArgString(Twine(
+        "-u",
+        llvm::StringRef("__llvm_profile_tls_register_thread_exit_handler"))));
+  }
+  ToolChain::addThreadLocalProfileRTLibs(Args, CmdArgs);
 }
 
 void Linux::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc
index e9866d94b762c..8655bcf498437 100644
--- a/compiler-rt/include/profile/InstrProfData.inc
+++ b/compiler-rt/include/profile/InstrProfData.inc
@@ -312,6 +312,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
 INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
                       INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
                       INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_tls_cnts, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_TLS_CNTS_COMMON), \
+                      INSTR_PROF_CNTS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
                       INSTR_PROF_BITS_COFF, "__DATA,")
@@ -750,6 +753,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
 #define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_TLS_CNTS_COMMON __llvm_tls_prf_cnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt
index 45e5164891751..b9f3a20bb328d 100644
--- a/compiler-rt/lib/profile/CMakeLists.txt
+++ b/compiler-rt/lib/profile/CMakeLists.txt
@@ -70,14 +70,25 @@ set(PROFILE_SOURCES
   InstrProfilingUtil.c
   )
 
+set(PROFILE_STATIC_TLS_SOURCES
+  InstrProfilingTLS.c
+  InstrProfilingStaticTLSLinux.cpp)
+
+set(PROFILE_SHARED_TLS_SOURCES
+  InstrProfilingTLSDyLib.c
+  InstrProfilingDyLibLinux.cpp)
+
 set(PROFILE_HEADERS
   InstrProfiling.h
   InstrProfilingInternal.h
   InstrProfilingPort.h
   InstrProfilingUtil.h
+  InstrProfilingTLS.h
   WindowsMMap.h
   )
 
+set(PROFILE_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS})
+
 if(WIN32)
   list(APPEND PROFILE_SOURCES
     WindowsMMap.c
@@ -134,6 +145,30 @@ if(APPLE)
     ADDITIONAL_HEADERS ${PROFILE_HEADERS}
     PARENT_TARGET profile)
 else()
+  #if(UNIX AND NOT APPLE AND NOT ANDROID)
+  if(OS_NAME MATCHES "Linux")
+    add_compiler_rt_runtime(clang_rt.profile_threadlocal
+      STATIC
+      OS ${PROFILE_SUPPORTED_OS}
+      ARCHS ${PROFILE_SUPPORTED_ARCH}
+      CFLAGS ${EXTRA_FLAGS}
+      SOURCES ${PROFILE_STATIC_TLS_SOURCES} 
+      ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+      PARENT_TARGET profile)
+
+    add_compiler_rt_runtime(clang_rt.profile_threadlocal
+      SHARED
+      OS ${PROFILE_SUPPORTED_OS}
+      ARCHS ${PROFILE_SUPPORTED_ARCH}
+      CFLAGS ${EXTRA_FLAGS}
+      SOURCES ${PROFILE_SHARED_TLS_SOURCES} 
+      ADDITIONAL_HEADERS ${PROFILE_HEADERS}
+      OBJECT_LIBS RTInterception
+                  RTSanitizerCommon
+                  RTSanitizerCommonLibc
+      PARENT_TARGET profile)
+  endif()
+
   add_compiler_rt_runtime(clang_rt.profile
     STATIC
     ARCHS ${PROFILE_SUPPORTED_ARCH}
diff --git a/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
new file mode 100644
index 0000000000000..47f2baa6a5815
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingDyLibLinux.cpp
@@ -0,0 +1,63 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) ||      \
+    (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) ||          \
+    defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+#include "InstrProfilingTLSDyLib.h"
+}
+
+#include "interception/interception.h"
+
+extern "C" {
+
+struct pthread_wrapper_arg {
+  void *(*fn)(void *);
+  void *arg;
+  uint32_t arg_keepalive;
+};
+
+void *pthread_fn_wrapper(void *arg_ptr) {
+  struct pthread_wrapper_arg *wrapper_arg =
+      (struct pthread_wrapper_arg *)arg_ptr;
+  void *(*fn)(void *) = __atomic_load_n(&wrapper_arg->fn, __ATOMIC_RELAXED);
+  void *arg = __atomic_load_n(&wrapper_arg->arg, __ATOMIC_RELAXED);
+  __atomic_store_n(&wrapper_arg->arg_keepalive, 0, __ATOMIC_RELEASE);
+  // wrapper_arg must not be touched past this point: the creator may return.
+  // Startup: nothing to do (TLS is automatically loaded and zeroed).
+  // Run the user's start routine with its original argument.
+  void *retval = fn(arg);
+  // Cleanup: run the registered per-module thread exit handlers, which
+  // combine this thread's counters with the main counters.
+  run_thread_exit_handlers();
+  return retval;
+}
+
+void __llvm_register_profile_intercepts() { register_profile_intercepts(); }
+
+} // end extern "C"
+
+INTERCEPTOR(int, pthread_create, void *thread, void *attr,
+            void *(*start_routine)(void *), void *arg) {
+  // wrapper_arg is on our stack: wait until the child has copied it.
+  struct pthread_wrapper_arg wrapper_arg = {(void *(*)(void *))start_routine,
+                                            arg, 1};
+  int res =
+      REAL(pthread_create)(thread, attr, pthread_fn_wrapper, &wrapper_arg);
+  // If creation failed no child exists to clear the flag: do not spin forever.
+  while (res == 0 &&
+         __atomic_load_n(&wrapper_arg.arg_keepalive, __ATOMIC_ACQUIRE) == 1)
+    ;
+  return res;
+}
+
+void register_profile_intercepts() { INTERCEPT_FUNCTION(pthread_create); }
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index e4d99ef4872bd..64775f24fd83c 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -34,6 +34,7 @@
 #include "InstrProfiling.h"
 #include "InstrProfilingInternal.h"
 #include "InstrProfilingPort.h"
+#include "InstrProfilingTLS.h"
 #include "InstrProfilingUtil.h"
 
 /* From where is profile name specified.
@@ -1084,6 +1085,8 @@ void __llvm_profile_set_filename(const char *FilenamePat) {
   parseAndSetFilename(FilenamePat, PNS_runtime_api, 1);
 }
 
+void (*on_main_thread_exit)(void) = NULL;
+
 /* The public API for writing profile data into the file with name
  * set by previous calls to __llvm_profile_set_filename or
  * __llvm_profile_override_default_filename or
@@ -1097,6 +1100,9 @@ int __llvm_profile_write_file(void) {
   // Temporarily suspend getting SIGKILL when the parent exits.
   int PDeathSig = lprofSuspendSigKill();
 
+  if (on_main_thread_exit)
+    on_main_thread_exit();
+
   if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
     PROF_NOTE("Profile data not written to file: %s.\n", "already written");
     if (PDeathSig == 1)
diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
index b766436497b74..4f96523a56a37 100644
--- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
+++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c
@@ -45,6 +45,7 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY
     COMPILER_RT_WEAK;
 extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
 extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
 extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
diff --git a/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
new file mode 100644
index 0000000000000..fc5f785e1ab40
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingStaticTLSLinux.cpp
@@ -0,0 +1,123 @@
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) ||      \
+    (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) ||          \
+    defined(_AIX)
+
+#include <elf.h>
+#include <link.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+#include "InstrProfiling.h"
+#include "InstrProfilingInternal.h"
+#include "InstrProfilingTLS.h"
+}
+
+extern "C" {
+
+#define PROF_TLS_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_TLS_CNTS_COMMON)
+#define PROF_TLS_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_TLS_CNTS_COMMON)
+
+extern char PROF_TLS_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+extern char PROF_TLS_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
+
+COMPILER_RT_VISIBILITY char *__llvm_profile_begin_tls_counters(void) {
+  return &PROF_TLS_CNTS_START;
+}
+COMPILER_RT_VISIBILITY char *__llvm_profile_end_tls_counters(void) {
+  return &PROF_TLS_CNTS_STOP;
+}
+
+struct finalization_data {
+  char *mod_begin;
+  char *tls_img_begin;
+  char *tls_img_end;
+  char *cnts_begin;
+  char *cnts_end;
+};
+
+// This is O(num_modules + num_counters) unfortunately.  If there were a
+// mechanism to calculate the thread-local start of a thread-local section like
+// there is a mechanism to calculate the static start of a static section (i.e.
+// __start_$sectionname), that would simplify implementation a lot and make this
+// just O(num_counters).
+static int FindAndAddCounters_cb(struct dl_phdr_info *info, size_t size,
+                                 void *data) {
+  finalization_data *fdata = (finalization_data *)data;
+  char *mod_begin = fdata->mod_begin;
+  // We're looking for a match to the dladdr calculated based on PROF_CNTS_START
+  if (mod_begin != (char *)info->dlpi_addr) {
+    return 0;
+  }
+
+  if (info->dlpi_tls_data == NULL) {
+    return 1;
+  }
+
+  const Elf64_Phdr *hdr = info->dlpi_phdr;
+  const Elf64_Phdr *last_hdr = hdr + info->dlpi_phnum;
+
+  const Elf64_Phdr *tls_hdr;
+  for (; hdr != last_hdr; ++hdr) {
+    if (hdr->p_type == PT_TLS) {
+      tls_hdr = hdr;
+      goto found_tls_ph;
+    }
+  }
+  return 1;
+found_tls_ph:
+  uint64_t num_counters =
+      __llvm_profile_get_num_counters(fdata->tls_img_begin, fdata->tls_img_end);
+  uint64_t counter_size = __llvm_profile_counter_entry_size();
+
+  // Calculate the offset of __llvm_tls_prf_cnts into the tls block for this
+  // module. The addresses in use below correspond to the tls initialization
+  // image, which is statically allocated for the module, rather than the TLS
+  // block itself.
+  uint64_t ph_true_vaddr =
+      (uint64_t)info->dlpi_addr + (uint64_t)tls_hdr->p_vaddr;
+  uint64_t tls_cnts_tlsblk_offset =
+      (uint64_t)fdata->tls_img_begin - ph_true_vaddr;
+
+  // Calculate the thread local copy of __llvm_tls_prf_cnts for this module.
+  uint64_t tls_prf_cnts_modlocal_begin =
+      (uint64_t)info->dlpi_tls_data + tls_cnts_tlsblk_offset;
+
+  // We don't support single byte counters because they are also resilient to
+  // thread synchronization issues and they are designed to avoid memory
+  // overhead, which is the opposite of what TL counters do.
+  // TODO: warn?
+  if (counter_size == sizeof(uint64_t)) {
+    uint64_t *tls_cnt = (uint64_t *)tls_prf_cnts_modlocal_begin;
+    uint64_t *tls_end = (uint64_t *)tls_cnt + num_counters;
+    uint64_t *cnt = (uint64_t *)fdata->cnts_begin;
+    for (; tls_cnt != tls_end; tls_cnt++, cnt++) {
+      __atomic_fetch_add(cnt, *tls_cnt, __ATOMIC_RELAXED);
+    }
+  }
+  return 1;
+}
+
+COMPILER_RT_VISIBILITY
+void __llvm_profile_tls_counters_finalize(void) {
+  struct finalization_data fdata = {0};
+  fdata.tls_img_begin = __llvm_profile_begin_tls_counters();
+  fdata.tls_img_end = __llvm_profile_end_tls_counters();
+  fdata.cnts_begin = __llvm_profile_begin_counters();
+  fdata.cnts_end = __llvm_profile_end_counters();
+
+  if (!fdata.tls_img_begin || !fdata.tls_img_end || !fdata.cnts_begin ||
+      !fdata.cnts_end) {
+    return;
+  }
+
+  Dl_info info;
+  if (dladdr(fdata.cnts_begin, &info) == 0) {
+    return;
+  }
+  fdata.mod_begin = (char *)info.dli_fbase;
+  dl_iterate_phdr(FindAndAddCounters_cb, &fdata);
+}
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.c b/compiler-rt/lib/profile/InstrProfilingTLS.c
new file mode 100644
index 0000000000000..029ed9e542e5a
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.c
@@ -0,0 +1,29 @@
+#include "InstrProfilingTLS.h"
+#include "InstrProfiling.h"
+
+struct texit_fn_node module_node COMPILER_RT_VISIBILITY;
+
+// We act as a shim between the profile_threadlocal sharedlib
+// and the profile static lib.  We need to tell the static lib
+// to add all of the counters up on main thread exit, but the
+// shared lib is the one who knows how to do that and whether it's
+// already been done.
+//
+// In the constructor we pass flush_main_thread_counters from the
+// sharedlib to the non-tls statlib's on_main_thread_exit fnptr.
+extern void flush_main_thread_counters(void);
+extern void (*on_main_thread_exit)(void);
+
+__attribute__((constructor)) COMPILER_RT_VISIBILITY void
+__llvm_profile_tls_register_thread_exit_handler(void) {
+  module_node.prev = NULL;
+  module_node.next = NULL;
+  module_node.fn = __llvm_profile_tls_counters_finalize;
+  register_tls_prfcnts_module_thread_exit_handler(&module_node);
+  if (!on_main_thread_exit) {
+    on_main_thread_exit = flush_main_thread_counters;
+  }
+}
+
+// TODO: Add destructor
+// (But not yet, I'm scared)
diff --git a/compiler-rt/lib/profile/InstrProfilingTLS.h b/compiler-rt/lib/profile/InstrProfilingTLS.h
new file mode 100644
index 0000000000000..1b6001d27d375
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLS.h
@@ -0,0 +1,39 @@
+#ifndef INSTR_PROFILING_TLS_H
+#define INSTR_PROFILING_TLS_H
+
+char *__llvm_profile_begin_tls_counters(void);
+char *__llvm_profile_end_tls_counters(void);
+
+/*!
+ * \brief Add counter values from TLS to the global counters for the program
+ *
+ * On thread exit, atomically add the values in TLS counters to the static
+ * counters for the whole process.
+ */
+void __llvm_profile_tls_counters_finalize(void);
+
+/*
+ * Dylib stuff
+ */
+typedef void (*texit_fnc)(void);
+
+typedef struct texit_fn_node {
+  struct texit_fn_node *prev;
+  texit_fnc fn;
+  struct texit_fn_node *next;
+} texit_fn_node;
+
+// TODO: really this should be write-preferring rwlocked
+struct texit_fn_registry {
+  int texit_mtx;
+  texit_fn_node head;
+  texit_fn_node tail;
+};
+
+void register_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void unregister_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_node);
+void run_thread_exit_handlers(void);
+
+void register_profile_intercepts();
+
+#endif
diff --git a/compiler-rt/lib/profile/InstrProfilingTLSDyLib.c b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.c
new file mode 100644
index 0000000000000..e82780dbcf6ab
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.c
@@ -0,0 +1,100 @@
+#include "InstrProfiling.h"
+#include "InstrProfilingTLS.h"
+#include <stdlib.h>
+
+// Maintain a linked list of handlers to run on thread exit.
+// This is broken out into a dylib so that the registry is truly global across
+// dlopen et al.
+//
+// Each module has a statically allocated node that gets linked into the
+// registry on the constructor and that gets linked out of the registry on
+// destroy.
+//
+// This node is defined in the static portion of the tls counts extension.
+
+struct texit_fn_registry texit_registry;
+
+static void lock_texit_registry(void) {
+  int expected = 0;
+  while (!__atomic_compare_exchange_n(&texit_registry.texit_mtx, &expected, 1,
+                                      0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
+    expected = 0;
+  }
+}
+
+static void unlock_texit_registry(void) {
+  __atomic_store_n(&texit_registry.texit_mtx, 0, __ATOMIC_RELEASE);
+}
+
+static void wlock_texit_registry(void) { lock_texit_registry(); }
+
+static void wunlock_texit_registry(void) { unlock_texit_registry(); }
+
+static void rlock_texit_registry(void) { lock_texit_registry(); }
+
+static void runlock_texit_registry(void) { unlock_texit_registry(); }
+
+static inline texit_fn_node *take_nodep(texit_fn_node **nodepp) {
+  texit_fn_node *nodep = *nodepp;
+  *nodepp = NULL;
+  return nodep;
+}
+
+static inline texit_fn_node *replace_nodep(texit_fn_node **nodepp,
+                                           texit_fn_node *new_nodep) {
+  texit_fn_node *nodep = *nodepp;
+  *nodepp = new_nodep;
+  return nodep;
+}
+
+void flush_main_thread_counters(void) {
+  static int flushed = 0;
+  if (!flushed) {
+    run_thread_exit_handlers();
+    flushed = 1;
+  }
+}
+
+__attribute__((constructor)) static void __initialize_tls_exit_registry() {
+  register_profile_intercepts();
+  texit_registry.texit_mtx = 0;
+  texit_registry.head.prev = NULL;
+  texit_registry.head.fn = NULL;
+  texit_registry.head.next = &texit_registry.tail;
+  texit_registry.tail.prev = &texit_registry.head;
+  texit_registry.tail.fn = NULL;
+  texit_registry.tail.next = NULL;
+}
+
+// Should run from module constructor
+void register_tls_prfcnts_module_thread_exit_handler(texit_fn_node *new_nodep) {
+  wlock_texit_registry();
+  texit_fn_node *prev = replace_nodep(&texit_registry.tail.prev, new_nodep);
+  texit_fn_node *next = replace_nodep(&prev->next, new_nodep);
+  new_nodep->next = next;
+  new_nodep->prev = prev;
+  wunlock_texit_registry();
+}
+
+// Should run from module destructor
+// Also, this destructor/constructor pair should be outermost.  At least outside
+// of the regular llvm_profile stuff.
+void unregister_tls_prfcnts_module_thread_exit_handler(
+    texit_fn_node *old_nodep) {
+  wlock_texit_registry();
+  texit_fn_node *prev = take_nodep(&old_nodep->prev);
+  texit_fn_node *next = take_nodep(&old_nodep->next);
+  prev->next = next;
+  next->prev = prev;
+  wunlock_texit_registry();
+}
+
+void run_thread_exit_handlers(void) {
+  rlock_texit_registry();
+  for (texit_fn_node *node = texit_registry.head.next;
+       node != &texit_registry.tail; node = node->next) {
+    if (node->fn != NULL)
+      node->fn();
+  }
+  runlock_texit_registry();
+}
diff --git a/compiler-rt/lib/profile/InstrProfilingTLSDyLib.h b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.h
new file mode 100644
index 0000000000000..3c429d81129ec
--- /dev/null
+++ b/compiler-rt/lib/profile/InstrProfilingTLSDyLib.h
@@ -0,0 +1,4 @@
+#ifndef INSTR_PROFILING_TLS_DYLIB_H
+#define INSTR_PROFILING_TLS_DYLIB_H
+
+#endif
diff --git a/compiler-rt/lib/tsan/rtl/CMakeLists.txt b/compiler-rt/lib/tsan/rtl/CMakeLists.txt
index f40e72dbde1f9..8ddb6af279284 100644
--- a/compiler-rt/lib/tsan/rtl/CMakeLists.txt
+++ b/compiler-rt/lib/tsan/rtl/CMakeLists.txt
@@ -1,6 +1,6 @@
 include_directories(../..)
 
-set(TSAN_RTL_CFLAGS ${TSAN_CFLAGS})
+set(TSAN_RTL_CFLAGS ${TSAN_CFLAGS} -O0 -g3)
 append_list_if(COMPILER_RT_HAS_MSSE4_2_FLAG -msse4.2 TSAN_RTL_CFLAGS)
 append_list_if(SANITIZER_LIMIT_FRAME_SIZE -Wframe-larger-than=530
                TSAN_RTL_CFLAGS)
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c
new file mode 100644
index 0000000000000..fcf874000aa8e
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-lib.c
@@ -0,0 +1,7 @@
+unsigned char determine_value_dyn(unsigned char c) {
+  if (c < 0x80) {
+    return c;
+  } else {
+    return -c;
+  }
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c
new file mode 100644
index 0000000000000..309d405430af4
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlclose-main.c
@@ -0,0 +1,93 @@
+#include <dlfcn.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct thread_arg {
+  uint64_t buf_size;
+  char const *buf;
+  uint64_t iteration_counter;
+  uint64_t output;
+};
+
+#ifndef DLOPEN_FUNC_DIR
+unsigned char determine_value_dyn(unsigned char);
+#endif
+
+void *thread_fn(void *arg_ptr) {
+#ifdef DLOPEN_FUNC_DIR
+  // Resolve determine_value_dyn at runtime from the dlopen'd shared library.
+  unsigned char (*determine_value_dyn)(unsigned char) = NULL;
+
+  const char *dynlib_name = DLOPEN_FUNC_DIR "/lib.shared";
+  const char *dynlib_sym = "determine_value_dyn";
+  void *handle = dlopen(dynlib_name, DLOPEN_FLAGS);
+  if (handle == NULL) {
+    fprintf(stderr, "dlopen error on: %s: %s\n", dynlib_name, dlerror());
+    exit(EXIT_FAILURE);
+  }
+
+  determine_value_dyn = dlsym(handle, dynlib_sym);
+  if (determine_value_dyn == NULL) { // check the symbol, not the handle
+    fprintf(stderr, "dlsym error on: %s : %s\n", dynlib_name, dynlib_sym);
+    exit(EXIT_FAILURE);
+  }
+#endif
+
+  struct thread_arg *arg = (struct thread_arg *)arg_ptr;
+  for (uint64_t i = 0; i < arg->buf_size; i++) {
+    unsigned char c = (unsigned char)arg->buf[i];
+    arg->output += determine_value_dyn(c);
+    arg->iteration_counter++;
+  }
+
+  // This should unload the thread local counters region for this module,
+  // causing an expected failure for -fprofile-thread-local
+#ifdef DLOPEN_FUNC_DIR
+#  ifndef DONT_CLOSE
+  dlclose(handle);
+#  endif
+#endif
+  return NULL;
+}
+
+int main() {
+  const uint64_t len = 40000;
+  // Fill the buffer with the repeating byte pattern 0x00, 0x00, 0xff.
+  char *example_string = (char *)malloc(sizeof(char) * len);
+  int high = 0;
+  for (uint64_t i = 0; i < len; i++) {
+    if (high == 2) {
+      example_string[i] = 0xff;
+      high = 0;
+    } else {
+      example_string[i] = 0x0;
+      high++;
+    }
+  }
+  // Run the workload over the buffer on a separate thread and wait for it.
+  pthread_t thread;
+  struct thread_arg arg = {
+      len,
+      example_string,
+      0,
+      0,
+  };
+  if (pthread_create(&thread, NULL, thread_fn, &arg) != 0) {
+    fprintf(stderr, "Failed to spawn thread, exiting\n");
+    exit(EXIT_FAILURE); // a failed spawn must not be reported as success
+  }
+
+  if (pthread_join(thread, NULL) != 0) {
+    fprintf(stderr, "Failed to join thread, continuing\n");
+    return EXIT_FAILURE;
+  }
+
+  printf("Thread output:\n"
+         "iteration_counter: %lu\n"
+         "output: %lx\n\n",
+         arg.iteration_counter, arg.output);
+
+  return EXIT_SUCCESS;
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c
new file mode 100644
index 0000000000000..9ec903ab4c17a
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func.c
@@ -0,0 +1,9 @@
+#include <stdint.h>
+
+int8_t func(int8_t input) {
+  // Negative inputs pass through unchanged; all others are negated.
+  if (input >= 0) {
+    return -input;
+  }
+  return input;
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c
new file mode 100644
index 0000000000000..94122d793a6ee
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-func2.c
@@ -0,0 +1,9 @@
+#include <stdint.h>
+
+int8_t func2(int8_t input) {
+  // Yields 1 for negative inputs and -1 for everything else.
+  if (input < 0) {
+    return 1;
+  }
+  return -1;
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c
new file mode 100644
index 0000000000000..fc436841d233c
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-dlopen-main.c
@@ -0,0 +1,105 @@
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef DLOPEN_FUNC_DIR
+#  include <dlfcn.h>
+int8_t (*func)(int8_t) = NULL;
+int8_t (*func2)(int8_t) = NULL;
+#else
+int8_t func(int8_t);
+int8_t func2(int8_t);
+#endif
+
+struct thread_arg {
+  uint64_t buf_size; // number of bytes thread_fn reads from buf
+  char const *buf;   // input bytes; thread_fn only reads them
+  uint64_t output;   // running sum of the func() and func2() results
+};
+
+void *thread_fn(void *arg_ptr) {
+  // Adds func() and func2() of every input byte to the caller's accumulator.
+  struct thread_arg *const job = (struct thread_arg *)arg_ptr;
+  for (uint64_t pos = 0; pos != job->buf_size; ++pos) {
+    job->output += func((int8_t)job->buf[pos]);
+    job->output += func2((int8_t)job->buf[pos]);
+  }
+  return NULL;
+}
+
+int main() {
+  // Runs n_threads workers over one shared buffer and prints each result.
+#define n_threads 10
+#define len 40000
+
+  // With DLOPEN_FUNC_DIR, func and func2 are resolved from shared objects.
+#ifdef DLOPEN_FUNC_DIR
+  const char *dynlib_path = DLOPEN_FUNC_DIR "/func.shared";
+  const char *dynlib_sym = "func";
+  void *handle = dlopen(dynlib_path, RTLD_LAZY);
+  if (handle == NULL) {
+    fprintf(stderr, "dlopen error on: %s: %s\n", dynlib_path, dlerror());
+    return EXIT_FAILURE;
+  }
+
+  func = dlsym(handle, dynlib_sym);
+  if (func == NULL) {
+    fprintf(stderr, "dlsym error on: %s : %s\n", dynlib_path, dynlib_sym);
+    return EXIT_FAILURE;
+  }
+
+  const char *dynlib_path2 = DLOPEN_FUNC_DIR "/func2.shared";
+  const char *dynlib_sym2 = "func2";
+  void *handle2 = dlopen(dynlib_path2, RTLD_LAZY);
+  if (handle2 == NULL) {
+    fprintf(stderr, "dlopen error on: %s: %s\n", dynlib_path2, dlerror());
+    return EXIT_FAILURE;
+  }
+
+  func2 = dlsym(handle2, dynlib_sym2);
+  if (func2 == NULL) {
+    fprintf(stderr, "dlsym error on: %s : %s\n", dynlib_path2, dynlib_sym2);
+    return EXIT_FAILURE;
+  }
+#endif
+
+  pthread_t threads[n_threads] = {0};
+  struct thread_arg args[n_threads] = {0};
+  char *example_string = (char *)malloc(sizeof(char) * len);
+  int high = 0;
+  for (uint64_t i = 0; i < len; i++) {
+    if (high == 2) {
+      example_string[i] = 0xff;
+      high = 0;
+    } else {
+      example_string[i] = 0x0;
+      high++;
+    }
+  }
+
+  for (uint64_t i = 0; i < n_threads; i++) {
+    args[i].buf_size = len;
+    args[i].buf = example_string;
+    args[i].output = 0;
+    if (pthread_create(&threads[i], NULL, thread_fn, &args[i]) != 0) {
+      fprintf(stderr, "Failed to spawn thread %lu, exiting\n", i);
+      return EXIT_FAILURE;
+    }
+  }
+
+  int rc = EXIT_SUCCESS;
+  for (uint64_t i = 0; i < n_threads; i++) {
+    void *retval = NULL;
+    if (pthread_join(threads[i], &retval) != 0) {
+      // Report on stderr like the other failure messages in main.
+      fprintf(stderr, "Failed to join thread %lu, continuing\n", i);
+      rc = EXIT_FAILURE;
+    }
+
+    printf("Thread %lu output:\n"
+           "output: %lx\n\n",
+           i, args[i].output);
+  }
+  return rc;
+}
diff --git a/compiler-rt/test/profile/Inputs/instrprof-tls-exit.c b/compiler-rt/test/profile/Inputs/instrprof-tls-exit.c
new file mode 100644
index 0000000000000..f7e6f78f019db
--- /dev/null
+++ b/compiler-rt/test/profile/Inputs/instrprof-tls-exit.c
@@ -0,0 +1,37 @@
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void *exit_thread(void *arg_ptr) {
+  // Blocks on the semaphore passed via arg_ptr, then ends the whole process.
+  printf("Exit thread waiting...\n");
+  if (sem_wait((sem_t *)arg_ptr) != 0) {
+    fprintf(stderr, "Failed to wait on signal from main thread\n");
+    exit(EXIT_FAILURE);
+  }
+  printf("Exit thread activated\n");
+  exit(0);
+  return NULL;
+}
+
+int main() {
+  pthread_t child; // thread expected to terminate the process via exit()
+  sem_t gate;      // posted by main once the child has been created
+  sem_init(&gate, 0, 0);
+  if (pthread_create(&child, NULL, exit_thread, &gate) != 0) {
+    fprintf(stderr, "Failed to spawn exit thread\n");
+    return EXIT_FAILURE;
+  }
+  if (sem_post(&gate) != 0) {
+    fprintf(stderr, "Failed to send signal to exit thread\n");
+    return EXIT_FAILURE;
+  }
+  if (pthread_join(child, NULL) != 0) {
+    fprintf(stderr, "Failed to join exit thread\n");
+    return EXIT_FAILURE;
+  }
+  fprintf(stderr, "Child thread should have called exit()\n");
+  return EXIT_FAILURE; // the child's exit() should have ended the process
+}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test
new file mode 100644
index 0000000000000..3974102090b4e
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-memfault.test
@@ -0,0 +1,27 @@
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+// Here we expect a segfault until the dlclose issue is fixed
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata    -o %t-main.tls.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-lib.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-lib.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: diff %t-lib.tls.ll %t-lib.atomic.ll
+
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test
new file mode 100644
index 0000000000000..8cafef927ee7e
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix-subset.test
@@ -0,0 +1,41 @@
+// Passing subset of combos where you still get coverage from modules
+// which were opened with RTLD_NODELETE and later closed.
+//
+// These combos work because pthread_create is intercepted before
+// it is first called.
+
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic-tl.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib:  Not working.
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw    %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw       %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: %clang_profuse=%t-tls.profdata         -o %t-main.tls.ll       -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata      -o %t-main.atomic.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata   -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll        %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll  %t-main.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test
new file mode 100644
index 0000000000000..74d5f9e2a4f58
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-mix.test
@@ -0,0 +1,48 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic-tl.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib
+
+RUN: mkdir -p %t.tl-atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tl-atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tl-atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tl-atomic -rpath %t.tl-atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw    %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw       %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+RUN: env LLVM_PROFILE_FILE=%t-tl-atomic.profraw %run %t-tl-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: llvm-profdata merge -o %t-tl-atomic.profdata %t-tl-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata         -o %t-main.tls.ll       -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata      -o %t-main.atomic.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata   -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-tl-atomic.profdata   -o %t-main.tl-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll        %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll  %t-main.atomic.ll
+RUN: diff %t-main.tl-atomic.ll  %t-main.atomic.ll
+
+// Atomic exe, threadlocal lib does not pass.
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test
new file mode 100644
index 0000000000000..8e99a3b60a69a
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlclose-nodelete.test
@@ -0,0 +1,24 @@
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_NODELETE | RTLD_LAZY" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_NODELETE | RTLD_LAZY" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata    -o %t-main.tls.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-lib.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-lib.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-lib.c
+RUN: diff %t-lib.tls.ll %t-lib.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test b/compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test
new file mode 100644
index 0000000000000..990c87e1bd86b
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-dlopen.test
@@ -0,0 +1,32 @@
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/func.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/func2.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func2.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlopen-main.c
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/func.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/func2.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlopen-func2.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlopen-main.c
+
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata    -o %t-main.tls.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-main.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-main.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-main.c
+RUN: diff %t-main.tls.ll %t-main.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-func2.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func2.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-func2.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func2.c
+RUN: diff %t-func2.tls.ll %t-func2.atomic.ll
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata -o %t-func.tls.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-func.atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlopen-func.c
+RUN: diff %t-func.tls.ll %t-func.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-exit.test b/compiler-rt/test/profile/Linux/instrprof-tls-exit.test
new file mode 100644
index 0000000000000..fef3c78f0726c
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-exit.test
@@ -0,0 +1,17 @@
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic %S/../Inputs/instrprof-tls-exit.c
+
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls %S/../Inputs/instrprof-tls-exit.c
+
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw %run %t-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+
+RUN: %clang_profuse=%t-tls.profdata    -o %t-tls.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-exit.c
+RUN: %clang_profuse=%t-atomic.profdata -o %t-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-exit.c
+RUN: diff %t-tls.ll %t-atomic.ll
+
+# In the first iteration of this change, only an exit from the main thread is expected to produce
+# correct coverage output, so this test (which exits from a child thread) is expected to fail.
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test b/compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test
new file mode 100644
index 0000000000000..67cb1d2f66543
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-noclose-mix.test
@@ -0,0 +1,51 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.tls.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.atomic-tl.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib
+
+RUN: mkdir -p %t.tl-atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tl-atomic.d/lib.shared -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -g3 -lpthread -DDONT_CLOSE -DDLOPEN_FUNC_DIR=\"%t.tl-atomic.d\" -DDLOPEN_FLAGS="RTLD_LAZY | RTLD_NODELETE" -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tl-atomic -rpath %t.tl-atomic.d %S/../Inputs/instrprof-tls-dlclose-main.c
+
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw    %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw       %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+RUN: env LLVM_PROFILE_FILE=%t-tl-atomic.profraw %run %t-tl-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: llvm-profdata merge -o %t-tl-atomic.profdata %t-tl-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata         -o %t-main.tls.ll       -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata      -o %t-main.atomic.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata   -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-tl-atomic.profdata   -o %t-main.tl-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll        %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll  %t-main.atomic.ll
+
+// The failure associated with threadlocal dlopened lib, atomic-update executable is that pthread_create
+// is not intercepted before it is first called.  That means that we can't run the thread exit handler.
+RUN: diff %t-main.tl-atomic.ll  %t-main.atomic.ll
+
+// Atomic exe, threadlocal lib does not pass.
+XFAIL: target={{.*}}
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test
new file mode 100644
index 0000000000000..6d6b7e4b3bb59
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix-subset.test
@@ -0,0 +1,35 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d -L%t.tls.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d -L%t.atomic.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d -L%t.atomic-tl.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib: omitted here because that combination does not pass; see instrprof-tls-shared-mix.test.
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw    %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw       %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: %clang_profuse=%t-tls.profdata         -o %t-main.tls.ll       -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata      -o %t-main.atomic.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata   -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll        %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll  %t-main.atomic.ll
diff --git a/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test
new file mode 100644
index 0000000000000..1fb58128ada32
--- /dev/null
+++ b/compiler-rt/test/profile/Linux/instrprof-tls-shared-mix.test
@@ -0,0 +1,48 @@
+// All threadlocal
+
+RUN: mkdir -p %t.tls.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tls.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tls -rpath %t.tls.d -L%t.tls.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// All atomic
+
+RUN: mkdir -p %t.atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic -rpath %t.atomic.d -L%t.atomic.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic lib, threadlocal exe
+
+RUN: mkdir -p %t.atomic-tl.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.atomic-tl.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-atomic-tl -rpath %t.atomic-tl.d -L%t.atomic-tl.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+// Atomic exe, threadlocal lib
+
+RUN: mkdir -p %t.tl-atomic.d
+RUN: %clang_profgen -fcoverage-mapping -fprofile-thread-local -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t.tl-atomic.d/liblib.so -fPIC -shared %S/../Inputs/instrprof-tls-dlclose-lib.c
+
+RUN: %clang_profgen -fcoverage-mapping -lpthread -fprofile-update=atomic -fdata-sections -ffunction-sections -fuse-ld=gold -Wl,--gc-sections -o %t-tl-atomic -rpath %t.tl-atomic.d -L%t.tl-atomic.d -llib %S/../Inputs/instrprof-tls-dlclose-main.c
+
+
+RUN: env LLVM_PROFILE_FILE=%t-atomic.profraw    %run %t-atomic
+RUN: env LLVM_PROFILE_FILE=%t-tls.profraw       %run %t-tls
+RUN: env LLVM_PROFILE_FILE=%t-atomic-tl.profraw %run %t-atomic-tl
+RUN: env LLVM_PROFILE_FILE=%t-tl-atomic.profraw %run %t-tl-atomic
+
+RUN: llvm-profdata merge -o %t-tls.profdata %t-tls.profraw
+RUN: llvm-profdata merge -o %t-atomic.profdata %t-atomic.profraw
+RUN: llvm-profdata merge -o %t-atomic-tl.profdata %t-atomic-tl.profraw
+RUN: llvm-profdata merge -o %t-tl-atomic.profdata %t-tl-atomic.profraw
+RUN: %clang_profuse=%t-tls.profdata         -o %t-main.tls.ll       -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic.profdata      -o %t-main.atomic.ll    -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-atomic-tl.profdata   -o %t-main.atomic-tl.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: %clang_profuse=%t-tl-atomic.profdata   -o %t-main.tl-atomic.ll -S -emit-llvm %S/../Inputs/instrprof-tls-dlclose-main.c
+RUN: diff %t-main.tls.ll        %t-main.atomic.ll
+RUN: diff %t-main.atomic-tl.ll  %t-main.atomic.ll
+RUN: diff %t-main.tl-atomic.ll  %t-main.atomic.ll
+
+// Atomic exe, threadlocal lib does not pass.
+XFAIL: target={{.*}}
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 817ad9550f652..0c78450641db6 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -100,6 +100,9 @@ inline StringRef getInstrProfDataVarPrefix() { return "__profd_"; }
 /// Return the name prefix of profile counter variables.
 inline StringRef getInstrProfCountersVarPrefix() { return "__profc_"; }
 
+/// Return the name prefix of thread-local (TLS) profile counter variables.
+inline StringRef getInstrProfCountersTLSVarPrefix() { return "__profc_tls_"; }
+
 /// Return the name prefix of profile bitmap variables.
 inline StringRef getInstrProfBitmapVarPrefix() { return "__profbm_"; }
 
diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc
index e9866d94b762c..8655bcf498437 100644
--- a/llvm/include/llvm/ProfileData/InstrProfData.inc
+++ b/llvm/include/llvm/ProfileData/InstrProfData.inc
@@ -312,6 +312,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_data, \
 INSTR_PROF_SECT_ENTRY(IPSK_cnts, \
                       INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON), \
                       INSTR_PROF_CNTS_COFF, "__DATA,")
+INSTR_PROF_SECT_ENTRY(IPSK_tls_cnts, \
+                      INSTR_PROF_QUOTE(INSTR_PROF_TLS_CNTS_COMMON), \
+                      INSTR_PROF_CNTS_COFF, "__DATA,")
 INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \
                       INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON), \
                       INSTR_PROF_BITS_COFF, "__DATA,")
@@ -750,6 +753,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_NAME_COMMON __llvm_prf_names
 #define INSTR_PROF_VNAME_COMMON __llvm_prf_vns
 #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts
+#define INSTR_PROF_TLS_CNTS_COMMON __llvm_tls_prf_cnts
 #define INSTR_PROF_BITS_COMMON __llvm_prf_bits
 #define INSTR_PROF_VALS_COMMON __llvm_prf_vals
 #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index f9b58d9f27821..d4005350e84ca 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -85,6 +85,11 @@ cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate(
                           "Use debug info to correlate"),
                clEnumValN(InstrProfCorrelator::BINARY, "binary",
                           "Use binary to correlate")));
+
+cl::opt<bool>
+    InstrProfThreadLocal("instr-prof-thread-local",
+                         cl::desc("Generate thread local counter regions"),
+                         cl::init(false));
 } // namespace llvm
 
 namespace {
@@ -215,6 +220,10 @@ class InstrLowerer final {
   struct PerFunctionProfileData {
     uint32_t NumValueSites[IPVK_Last + 1] = {};
     GlobalVariable *RegionCounters = nullptr;
+    GlobalVariable *TLSRegionCounters = nullptr;
+    // Both a regular DataVar and a TLS DataVar must exist when TLS counters
+    // are in use.
+    GlobalVariable *TLSDataVar = nullptr;
     GlobalVariable *DataVar = nullptr;
     GlobalVariable *RegionBitmaps = nullptr;
     uint32_t NumBitmapBytes = 0;
@@ -286,16 +295,24 @@ class InstrLowerer final {
   /// acts on.
   Value *getCounterAddress(InstrProfCntrInstBase *I);
 
+  Value *getThreadLocalCounterAddress(InstrProfCntrInstBase *I);
+
   /// Get the region counters for an increment, creating them if necessary.
   ///
   /// If the counter array doesn't yet exist, the profile data variables
   /// referring to them will also be created.
   GlobalVariable *getOrCreateRegionCounters(InstrProfCntrInstBase *Inc);
 
+  /// Get the thread local region counters, creating them if necessary.
+  /// These must exist alongside the global region counters.
+  GlobalVariable *
+  getOrCreateThreadLocalRegionCounters(InstrProfCntrInstBase *Inc);
+
   /// Create the region counters.
   GlobalVariable *createRegionCounters(InstrProfCntrInstBase *Inc,
                                        StringRef Name,
-                                       GlobalValue::LinkageTypes Linkage);
+                                       GlobalValue::LinkageTypes Linkage,
+                                       bool ThreadLocal);
 
   /// Compute the address of the test vector bitmap that this profiling
   /// instruction acts on.
@@ -608,6 +625,7 @@ enum class ValueProfilingCallType {
 
 } // end anonymous namespace
 
+// TODO: put TLS counters incompatibility checks here
 PreservedAnalyses InstrProfilingLoweringPass::run(Module &M,
                                                   ModuleAnalysisManager &AM) {
   FunctionAnalysisManager &FAM =
@@ -894,6 +912,9 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
 
 Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
   auto *Counters = getOrCreateRegionCounters(I);
+  if (InstrProfThreadLocal) {
+    return getThreadLocalCounterAddress(I);
+  }
   IRBuilder<> Builder(I);
 
   if (isa<InstrProfTimestampInst>(I))
@@ -932,6 +953,22 @@ Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) {
   return Builder.CreateIntToPtr(Add, Addr->getType());
 }
 
+Value *InstrLowerer::getThreadLocalCounterAddress(InstrProfCntrInstBase *I) {
+  GlobalVariable *TLSCounters = getOrCreateThreadLocalRegionCounters(I);
+  IRBuilder<> Builder(I);
+
+  // Timestamp instructions request 8-byte alignment on the counter array.
+  if (isa<InstrProfTimestampInst>(I))
+    TLSCounters->setAlignment(Align(8));
+
+  // Index into the thread local counter array at this instruction's slot.
+  Value *TLSBase = Builder.CreateThreadLocalAddress(TLSCounters);
+  auto *Addr = Builder.CreateConstInBoundsGEP2_32(
+      TLSCounters->getValueType(), TLSBase, 0, I->getIndex()->getZExtValue());
+  assert(!isRuntimeCounterRelocationEnabled());
+  return Addr;
+}
+
 Value *InstrLowerer::getBitmapAddress(InstrProfMCDCTVBitmapUpdate *I) {
   auto *Bitmaps = getOrCreateRegionBitmaps(I);
   IRBuilder<> Builder(I);
@@ -1391,13 +1428,18 @@ GlobalVariable *InstrLowerer::setupProfileSection(InstrProfInstBase *Inc,
     VarPrefix = getInstrProfCountersVarPrefix();
     VarName = getVarName(Inc, VarPrefix, Renamed);
     InstrProfCntrInstBase *CntrIncrement = dyn_cast<InstrProfCntrInstBase>(Inc);
-    Ptr = createRegionCounters(CntrIncrement, VarName, Linkage);
+    Ptr = createRegionCounters(CntrIncrement, VarName, Linkage, false);
   } else if (IPSK == IPSK_bitmap) {
     VarPrefix = getInstrProfBitmapVarPrefix();
     VarName = getVarName(Inc, VarPrefix, Renamed);
     InstrProfMCDCBitmapInstBase *BitmapUpdate =
         dyn_cast<InstrProfMCDCBitmapInstBase>(Inc);
     Ptr = createRegionBitmaps(BitmapUpdate, VarName, Linkage);
+  } else if (IPSK == IPSK_tls_cnts) {
+    VarPrefix = getInstrProfCountersTLSVarPrefix();
+    VarName = getVarName(Inc, VarPrefix, Renamed);
+    InstrProfCntrInstBase *CntrIncrement = dyn_cast<InstrProfCntrInstBase>(Inc);
+    Ptr = createRegionCounters(CntrIncrement, VarName, Linkage, true);
   } else {
     llvm_unreachable("Profile Section must be for Counters or Bitmaps");
   }
@@ -1440,7 +1482,8 @@ InstrLowerer::getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc) {
 
 GlobalVariable *
 InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name,
-                                   GlobalValue::LinkageTypes Linkage) {
+                                   GlobalValue::LinkageTypes Linkage,
+                                   bool ThreadLocal) {
   uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
   auto &Ctx = M.getContext();
   GlobalVariable *GV;
@@ -1460,6 +1503,7 @@ InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name,
                             Constant::getNullValue(CounterTy), Name);
     GV->setAlignment(Align(8));
   }
+  GV->setThreadLocal(ThreadLocal);
   return GV;
 }
 
@@ -1475,6 +1519,10 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
   auto *CounterPtr = setupProfileSection(Inc, IPSK_cnts);
   PD.RegionCounters = CounterPtr;
 
+  if (InstrProfThreadLocal) {
+    PD.TLSRegionCounters = setupProfileSection(Inc, IPSK_tls_cnts);
+  }
+
   if (DebugInfoCorrelate ||
       ProfileCorrelate == InstrProfCorrelator::DEBUG_INFO) {
     LLVMContext &Ctx = M.getContext();
@@ -1518,6 +1566,21 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
   return PD.RegionCounters;
 }
 
+GlobalVariable *
+InstrLowerer::getOrCreateThreadLocalRegionCounters(InstrProfCntrInstBase *Inc) {
+  // TLS counters are only created when InstrProfThreadLocal is set; without
+  // it this function would return a null pointer.
+  assert(InstrProfThreadLocal);
+  GlobalVariable *FuncName = Inc->getName();
+  auto &PD = ProfileDataMap[FuncName];
+  if (!PD.TLSRegionCounters) {
+    // Creating the regular region counters also sets up TLSRegionCounters
+    // when InstrProfThreadLocal is true.
+    (void)getOrCreateRegionCounters(Inc);
+  }
+  return PD.TLSRegionCounters;
+}
+
 void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
   // When debug information is correlated to profile data, a data variable
   // is not needed.
@@ -1555,6 +1618,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
       getVarName(Inc, getInstrProfCountersVarPrefix(), Renamed);
   std::string DataVarName =
       getVarName(Inc, getInstrProfDataVarPrefix(), Renamed);
+  std::string TLSDataVarName =
+      getVarName(Inc, getInstrProfCountersTLSVarPrefix(), Renamed);
 
   auto *Int8PtrTy = PointerType::getUnqual(Ctx);
   // Allocate statically the array of pointers to value profile nodes for



More information about the cfe-commits mailing list