[compiler-rt] 9ef6faf - [scudo][standalone] Fork support

Kostya Kortchinsky via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 14 07:52:00 PST 2020


Author: Kostya Kortchinsky
Date: 2020-01-14T07:51:48-08:00
New Revision: 9ef6faf49670e18eb1ba04105a7c70b450cdaa71

URL: https://github.com/llvm/llvm-project/commit/9ef6faf49670e18eb1ba04105a7c70b450cdaa71
DIFF: https://github.com/llvm/llvm-project/commit/9ef6faf49670e18eb1ba04105a7c70b450cdaa71.diff

LOG: [scudo][standalone] Fork support

Summary:
fork() wasn't well (or at all) supported in Scudo. This materialized
in deadlocks in children.

In order to properly support fork, we will lock the allocator pre-fork
and unlock it post-fork in parent and child. This is done via a
`pthread_atfork` call installing the necessary handlers.

A couple of things suck here: this function allocates - so this has to
be done post initialization as our init path is not reentrant, and it
doesn't allow for an extra pointer - so we can't pass the allocator we
are currently working with.

In order to work around this, I added a post-init template parameter
that gets executed once the allocator is initialized for the current
thread. Its job for the C wrappers is to install the atfork handlers.

I reorganized the impacted area a bit and added some tests, courtesy
of cferris@ that were deadlocking prior to this fix.

Subscribers: jfb, #sanitizers, llvm-commits

Tags: #sanitizers, #llvm

Differential Revision: https://reviews.llvm.org/D72470

Added: 
    

Modified: 
    compiler-rt/lib/scudo/standalone/CMakeLists.txt
    compiler-rt/lib/scudo/standalone/bytemap.h
    compiler-rt/lib/scudo/standalone/combined.h
    compiler-rt/lib/scudo/standalone/primary32.h
    compiler-rt/lib/scudo/standalone/primary64.h
    compiler-rt/lib/scudo/standalone/quarantine.h
    compiler-rt/lib/scudo/standalone/stats.h
    compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
    compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
    compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
    compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
    compiler-rt/lib/scudo/standalone/tsd.h
    compiler-rt/lib/scudo/standalone/tsd_exclusive.h
    compiler-rt/lib/scudo/standalone/tsd_shared.h
    compiler-rt/lib/scudo/standalone/wrappers_c.cpp
    compiler-rt/lib/scudo/standalone/wrappers_c.inc
    compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
    compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
index 920034b35778..a7249d1854ad 100644
--- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_compiler_rt_component(scudo_standalone)
-if (COMPILER_RT_HAS_GWP_ASAN)
+# FIXME: GWP-ASan is temporarily disabled, re-enable once issues are fixed.
+if (FALSE AND COMPILER_RT_HAS_GWP_ASAN)
   add_dependencies(scudo_standalone gwp_asan)
 endif()
 
@@ -106,7 +107,7 @@ set(SCUDO_SOURCES_CXX_WRAPPERS
 
 set(SCUDO_OBJECT_LIBS)
 
-if (COMPILER_RT_HAS_GWP_ASAN)
+if (FALSE AND COMPILER_RT_HAS_GWP_ASAN)
   list(APPEND SCUDO_OBJECT_LIBS RTGwpAsan)
   list(APPEND SCUDO_CFLAGS -DGWP_ASAN_HOOKS)
 endif()

diff  --git a/compiler-rt/lib/scudo/standalone/bytemap.h b/compiler-rt/lib/scudo/standalone/bytemap.h
index caeeb2fac879..a03a0c471062 100644
--- a/compiler-rt/lib/scudo/standalone/bytemap.h
+++ b/compiler-rt/lib/scudo/standalone/bytemap.h
@@ -34,6 +34,9 @@ template <uptr Size> class FlatByteMap {
     return Map[Index];
   }
 
+  void disable() {}
+  void enable() {}
+
 private:
   u8 *Map;
 };
@@ -82,6 +85,9 @@ template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap {
     return Level2Map[Index % Level2Size];
   }
 
+  void disable() { Mutex.lock(); }
+  void enable() { Mutex.unlock(); }
+
 private:
   u8 *get(uptr Index) const {
     DCHECK_LT(Index, Level1Size);

diff  --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 8a33f827d498..a0b4b2973e96 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -31,15 +31,23 @@
 static gwp_asan::GuardedPoolAllocator GuardedAlloc;
 #endif // GWP_ASAN_HOOKS
 
+extern "C" inline void EmptyCallback() {}
+
 namespace scudo {
 
-template <class Params> class Allocator {
+template <class Params, void (*PostInitCallback)(void) = EmptyCallback>
+class Allocator {
 public:
   using PrimaryT = typename Params::Primary;
   using CacheT = typename PrimaryT::CacheT;
-  typedef Allocator<Params> ThisT;
+  typedef Allocator<Params, PostInitCallback> ThisT;
   typedef typename Params::template TSDRegistryT<ThisT> TSDRegistryT;
 
+  void callPostInitCallback() {
+    static pthread_once_t OnceControl = PTHREAD_ONCE_INIT;
+    pthread_once(&OnceControl, PostInitCallback);
+  }
+
   struct QuarantineCallback {
     explicit QuarantineCallback(ThisT &Instance, CacheT &LocalCache)
         : Allocator(Instance), Cache(LocalCache) {}
@@ -420,12 +428,18 @@ template <class Params> class Allocator {
   void disable() {
     initThreadMaybe();
     TSDRegistry.disable();
+    Stats.disable();
+    Quarantine.disable();
+    Primary.disable();
     Secondary.disable();
   }
 
   void enable() {
     initThreadMaybe();
     Secondary.enable();
+    Primary.enable();
+    Quarantine.enable();
+    Stats.enable();
     TSDRegistry.enable();
   }
 

diff  --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 945324914d30..e296a78778e0 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -123,13 +123,26 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
   }
 
   void disable() {
-    for (uptr I = 0; I < NumClasses; I++)
-      getSizeClassInfo(I)->Mutex.lock();
+    // The BatchClassId must be locked last since other classes can use it.
+    for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--) {
+      if (static_cast<uptr>(I) == SizeClassMap::BatchClassId)
+        continue;
+      getSizeClassInfo(static_cast<uptr>(I))->Mutex.lock();
+    }
+    getSizeClassInfo(SizeClassMap::BatchClassId)->Mutex.lock();
+    RegionsStashMutex.lock();
+    PossibleRegions.disable();
   }
 
   void enable() {
-    for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--)
-      getSizeClassInfo(static_cast<uptr>(I))->Mutex.unlock();
+    PossibleRegions.enable();
+    RegionsStashMutex.unlock();
+    getSizeClassInfo(SizeClassMap::BatchClassId)->Mutex.unlock();
+    for (uptr I = 0; I < NumClasses; I++) {
+      if (I == SizeClassMap::BatchClassId)
+        continue;
+      getSizeClassInfo(I)->Mutex.unlock();
+    }
   }
 
   template <typename F> void iterateOverBlocks(F Callback) {

diff  --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index b208ff69bb05..ef02f0b772d6 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -125,13 +125,22 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator64 {
   }
 
   void disable() {
-    for (uptr I = 0; I < NumClasses; I++)
-      getRegionInfo(I)->Mutex.lock();
+    // The BatchClassId must be locked last since other classes can use it.
+    for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--) {
+      if (static_cast<uptr>(I) == SizeClassMap::BatchClassId)
+        continue;
+      getRegionInfo(static_cast<uptr>(I))->Mutex.lock();
+    }
+    getRegionInfo(SizeClassMap::BatchClassId)->Mutex.lock();
   }
 
   void enable() {
-    for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--)
-      getRegionInfo(static_cast<uptr>(I))->Mutex.unlock();
+    getRegionInfo(SizeClassMap::BatchClassId)->Mutex.unlock();
+    for (uptr I = 0; I < NumClasses; I++) {
+      if (I == SizeClassMap::BatchClassId)
+        continue;
+      getRegionInfo(I)->Mutex.unlock();
+    }
   }
 
   template <typename F> void iterateOverBlocks(F Callback) const {

diff  --git a/compiler-rt/lib/scudo/standalone/quarantine.h b/compiler-rt/lib/scudo/standalone/quarantine.h
index 2bf7e804ef35..406a0e23804d 100644
--- a/compiler-rt/lib/scudo/standalone/quarantine.h
+++ b/compiler-rt/lib/scudo/standalone/quarantine.h
@@ -205,7 +205,7 @@ template <typename Callback, typename Node> class GlobalQuarantine {
       ScopedLock L(CacheMutex);
       Cache.transfer(C);
     }
-    if (Cache.getSize() > getMaxSize() && RecyleMutex.tryLock())
+    if (Cache.getSize() > getMaxSize() && RecycleMutex.tryLock())
       recycle(atomic_load_relaxed(&MinSize), Cb);
   }
 
@@ -214,7 +214,7 @@ template <typename Callback, typename Node> class GlobalQuarantine {
       ScopedLock L(CacheMutex);
       Cache.transfer(C);
     }
-    RecyleMutex.lock();
+    RecycleMutex.lock();
     recycle(0, Cb);
   }
 
@@ -225,11 +225,22 @@ template <typename Callback, typename Node> class GlobalQuarantine {
                 getMaxSize() >> 10, getCacheSize() >> 10);
   }
 
+  void disable() {
+    // RecycleMutex must be locked 1st since we grab CacheMutex within recycle.
+    RecycleMutex.lock();
+    CacheMutex.lock();
+  }
+
+  void enable() {
+    CacheMutex.unlock();
+    RecycleMutex.unlock();
+  }
+
 private:
   // Read-only data.
   alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex CacheMutex;
   CacheT Cache;
-  alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex RecyleMutex;
+  alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex RecycleMutex;
   atomic_uptr MinSize;
   atomic_uptr MaxSize;
   alignas(SCUDO_CACHE_LINE_SIZE) atomic_uptr MaxCacheSize;
@@ -261,7 +272,7 @@ template <typename Callback, typename Node> class GlobalQuarantine {
       while (Cache.getSize() > MinSize)
         Tmp.enqueueBatch(Cache.dequeueBatch());
     }
-    RecyleMutex.unlock();
+    RecycleMutex.unlock();
     doRecycle(&Tmp, Cb);
   }
 

diff  --git a/compiler-rt/lib/scudo/standalone/stats.h b/compiler-rt/lib/scudo/standalone/stats.h
index 294b891d7bb0..38481e98e48d 100644
--- a/compiler-rt/lib/scudo/standalone/stats.h
+++ b/compiler-rt/lib/scudo/standalone/stats.h
@@ -87,6 +87,9 @@ class GlobalStats : public LocalStats {
       S[I] = static_cast<sptr>(S[I]) >= 0 ? S[I] : 0;
   }
 
+  void disable() { Mutex.lock(); }
+  void enable() { Mutex.unlock(); }
+
 private:
   mutable HybridMutex Mutex;
   DoublyLinkedList<LocalStats> StatsList;

diff  --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
index 470f89df0227..63007e358965 100644
--- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt
@@ -20,7 +20,8 @@ if(ANDROID)
   list(APPEND SCUDO_UNITTEST_CFLAGS -fno-emulated-tls)
 endif()
 
-if (COMPILER_RT_HAS_GWP_ASAN)
+# FIXME: GWP-ASan is temporarily disabled, re-enable once issues are fixed.
+if (FALSE AND COMPILER_RT_HAS_GWP_ASAN)
   list(APPEND SCUDO_UNITTEST_CFLAGS -DGWP_ASAN_HOOKS)
 endif()
 
@@ -42,7 +43,7 @@ endforeach()
 
 macro(add_scudo_unittest testname)
   cmake_parse_arguments(TEST "" "" "SOURCES;ADDITIONAL_RTOBJECTS" ${ARGN})
-  if (COMPILER_RT_HAS_GWP_ASAN)
+  if (FALSE AND COMPILER_RT_HAS_GWP_ASAN)
     list(APPEND TEST_ADDITIONAL_RTOBJECTS RTGwpAsan)
   endif()
 

diff  --git a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
index b32c62fe6ca1..4a3cf1cd0fc3 100644
--- a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp
@@ -36,6 +36,7 @@ template <class Config> class MockAllocator {
   void initCache(CacheT *Cache) { memset(Cache, 0, sizeof(*Cache)); }
   void commitBack(scudo::TSD<MockAllocator> *TSD) {}
   TSDRegistryT *getTSDRegistry() { return &TSDRegistry; }
+  void callPostInitCallback() {}
 
   bool isInitialized() { return Initialized; }
 

diff  --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
index c3699f1d2abd..976ac4f497c9 100644
--- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
@@ -299,7 +299,9 @@ TEST(ScudoWrappersCTest, MallocDisableDeadlock) {
       "");
 }
 
+// Fuchsia doesn't have fork or malloc_info.
 #if !SCUDO_FUCHSIA
+
 TEST(ScudoWrappersCTest, MallocInfo) {
   char Buffer[64];
   FILE *F = fmemopen(Buffer, sizeof(Buffer), "w+");
@@ -310,4 +312,79 @@ TEST(ScudoWrappersCTest, MallocInfo) {
   fclose(F);
   EXPECT_EQ(strncmp(Buffer, "<malloc version=\"scudo-", 23), 0);
 }
-#endif
+
+TEST(ScudoWrappersCTest, Fork) {
+  void *P;
+  pid_t Pid = fork();
+  EXPECT_GE(Pid, 0);
+  if (Pid == 0) {
+    P = malloc(Size);
+    EXPECT_NE(P, nullptr);
+    memset(P, 0x42, Size);
+    free(P);
+    _exit(0);
+  }
+  waitpid(Pid, nullptr, 0);
+  P = malloc(Size);
+  EXPECT_NE(P, nullptr);
+  memset(P, 0x42, Size);
+  free(P);
+
+  // fork should stall if the allocator has been disabled.
+  EXPECT_DEATH(
+      {
+        malloc_disable();
+        alarm(1);
+        Pid = fork();
+        EXPECT_GE(Pid, 0);
+      },
+      "");
+}
+
+static pthread_mutex_t Mutex;
+static pthread_cond_t Conditional = PTHREAD_COND_INITIALIZER;
+
+static void *enableMalloc(void *Unused) {
+  // Initialize the allocator for this thread.
+  void *P = malloc(Size);
+  EXPECT_NE(P, nullptr);
+  memset(P, 0x42, Size);
+  free(P);
+
+  // Signal the main thread we are ready.
+  pthread_mutex_lock(&Mutex);
+  pthread_cond_signal(&Conditional);
+  pthread_mutex_unlock(&Mutex);
+
+  // Wait for the malloc_disable & fork, then enable the allocator again.
+  sleep(1);
+  malloc_enable();
+
+  return nullptr;
+}
+
+TEST(ScudoWrappersCTest, DisableForkEnable) {
+  pthread_t ThreadId;
+  EXPECT_EQ(pthread_create(&ThreadId, nullptr, &enableMalloc, nullptr), 0);
+
+  // Wait for the thread to be warmed up.
+  pthread_mutex_lock(&Mutex);
+  pthread_cond_wait(&Conditional, &Mutex);
+  pthread_mutex_unlock(&Mutex);
+
+  // Disable the allocator and fork. fork should succeed after malloc_enable.
+  malloc_disable();
+  pid_t Pid = fork();
+  EXPECT_GE(Pid, 0);
+  if (Pid == 0) {
+    void *P = malloc(Size);
+    EXPECT_NE(P, nullptr);
+    memset(P, 0x42, Size);
+    free(P);
+    _exit(0);
+  }
+  waitpid(Pid, nullptr, 0);
+  EXPECT_EQ(pthread_join(ThreadId, 0), 0);
+}
+
+#endif // SCUDO_FUCHSIA

diff  --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
index 28ae41c03f42..4ccef5bb0dee 100644
--- a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp
@@ -8,6 +8,7 @@
 
 #include "tests/scudo_unit_test.h"
 
+#include <atomic>
 #include <condition_variable>
 #include <mutex>
 #include <thread>
@@ -113,3 +114,59 @@ TEST(ScudoWrappersCppTest, ThreadedNew) {
   for (auto &T : Threads)
     T.join();
 }
+
+#if !SCUDO_FUCHSIA
+// TODO(kostyak): for me, this test fails in a specific configuration when ran
+//                by itself with some Scudo or GWP-ASan violation. Other people
+//                can't seem to reproduce the failure. Consider skipping this in
+//                the event it fails on the upstream bots.
+TEST(ScudoWrappersCppTest, AllocAfterFork) {
+  std::atomic_bool Stop;
+
+  // Create threads that simply allocate and free different sizes.
+  std::vector<std::thread *> Threads;
+  for (size_t N = 0; N < 5; N++) {
+    std::thread *T = new std::thread([&Stop] {
+      while (!Stop) {
+        for (size_t SizeLog = 3; SizeLog <= 21; SizeLog++) {
+          char *P = new char[1UL << SizeLog];
+          EXPECT_NE(P, nullptr);
+          // Make sure this value is not optimized away.
+          asm volatile("" : : "r,m"(P) : "memory");
+          delete[] P;
+        }
+      }
+    });
+    Threads.push_back(T);
+  }
+
+  // Create a thread to fork and allocate.
+  for (size_t N = 0; N < 100; N++) {
+    pid_t Pid;
+    if ((Pid = fork()) == 0) {
+      for (size_t SizeLog = 3; SizeLog <= 21; SizeLog++) {
+        char *P = new char[1UL << SizeLog];
+        EXPECT_NE(P, nullptr);
+        // Make sure this value is not optimized away.
+        asm volatile("" : : "r,m"(P) : "memory");
+        // Make sure we can touch all of the allocation.
+        memset(P, 0x32, 1U << SizeLog);
+        // EXPECT_LE(1U << SizeLog, malloc_usable_size(ptr));
+        delete[] P;
+      }
+      _exit(10);
+    }
+    EXPECT_NE(-1, Pid);
+    int Status;
+    EXPECT_EQ(Pid, waitpid(Pid, &Status, 0));
+    EXPECT_FALSE(WIFSIGNALED(Status));
+    EXPECT_EQ(10, WEXITSTATUS(Status));
+  }
+
+  printf("Waiting for threads to complete\n");
+  Stop = true;
+  for (auto Thread : Threads)
+    Thread->join();
+  Threads.clear();
+}
+#endif

diff  --git a/compiler-rt/lib/scudo/standalone/tsd.h b/compiler-rt/lib/scudo/standalone/tsd.h
index 626cc4b80fb7..20f0d69cabfd 100644
--- a/compiler-rt/lib/scudo/standalone/tsd.h
+++ b/compiler-rt/lib/scudo/standalone/tsd.h
@@ -14,6 +14,7 @@
 #include "mutex.h"
 
 #include <limits.h> // for PTHREAD_DESTRUCTOR_ITERATIONS
+#include <pthread.h>
 
 // With some build setups, this might still not be defined.
 #ifndef PTHREAD_DESTRUCTOR_ITERATIONS

diff  --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
index 89b001a739c1..69479ea7bdf4 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
@@ -11,8 +11,6 @@
 
 #include "tsd.h"
 
-#include <pthread.h>
-
 namespace scudo {
 
 enum class ThreadState : u8 {
@@ -62,6 +60,7 @@ template <class Allocator> struct TSDRegistryExT {
   // To disable the exclusive TSD registry, we effectively lock the fallback TSD
   // and force all threads to attempt to use it instead of their local one.
   void disable() {
+    Mutex.lock();
     FallbackTSD->lock();
     atomic_store(&Disabled, 1U, memory_order_release);
   }
@@ -69,6 +68,7 @@ template <class Allocator> struct TSDRegistryExT {
   void enable() {
     atomic_store(&Disabled, 0U, memory_order_release);
     FallbackTSD->unlock();
+    Mutex.unlock();
   }
 
 private:
@@ -90,6 +90,7 @@ template <class Allocator> struct TSDRegistryExT {
         pthread_setspecific(PThreadKey, reinterpret_cast<void *>(Instance)), 0);
     ThreadTSD.initLinkerInitialized(Instance);
     State = ThreadState::Initialized;
+    Instance->callPostInitCallback();
   }
 
   pthread_key_t PThreadKey;

diff  --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h
index 347295011bb2..5ab8269519a9 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_shared.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h
@@ -12,8 +12,6 @@
 #include "linux.h" // for getAndroidTlsPtr()
 #include "tsd.h"
 
-#include <pthread.h>
-
 namespace scudo {
 
 template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
@@ -73,13 +71,15 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
   }
 
   void disable() {
+    Mutex.lock();
     for (u32 I = 0; I < NumberOfTSDs; I++)
       TSDs[I].lock();
   }
 
   void enable() {
-    for (u32 I = 0; I < NumberOfTSDs; I++)
+    for (s32 I = NumberOfTSDs - 1; I >= 0; I--)
       TSDs[I].unlock();
+    Mutex.unlock();
   }
 
 private:
@@ -117,6 +117,7 @@ template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT {
     // Initial context assignment is done in a plain round-robin fashion.
     const u32 Index = atomic_fetch_add(&CurrentIndex, 1U, memory_order_relaxed);
     setCurrentTSD(&TSDs[Index % NumberOfTSDs]);
+    Instance->callPostInitCallback();
   }
 
   NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) {

diff  --git a/compiler-rt/lib/scudo/standalone/wrappers_c.cpp b/compiler-rt/lib/scudo/standalone/wrappers_c.cpp
index dffd7cc26fe8..93a666c4d61e 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c.cpp
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c.cpp
@@ -18,22 +18,23 @@
 #include <stdint.h>
 #include <stdio.h>
 
-static scudo::Allocator<scudo::Config> Allocator;
+#define SCUDO_PREFIX(name) name
+#define SCUDO_ALLOCATOR Allocator
+
+extern "C" void SCUDO_PREFIX(malloc_postinit)();
+static scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)>
+    SCUDO_ALLOCATOR;
 // Pointer to the static allocator so that the C++ wrappers can access it.
 // Technically we could have a completely separated heap for C & C++ but in
 // reality the amount of cross pollination between the two is staggering.
-scudo::Allocator<scudo::Config> *AllocatorPtr = &Allocator;
-
-extern "C" {
+scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)> *
+    CONCATENATE(SCUDO_ALLOCATOR, Ptr) = &SCUDO_ALLOCATOR;
 
-#define SCUDO_PREFIX(name) name
-#define SCUDO_ALLOCATOR Allocator
 #include "wrappers_c.inc"
+
 #undef SCUDO_ALLOCATOR
 #undef SCUDO_PREFIX
 
-INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); }
-
-} // extern "C"
+extern "C" INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); }
 
 #endif // !SCUDO_ANDROID || !_BIONIC

diff  --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
index edf39f514480..2fd709eaa1f6 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc
@@ -17,6 +17,8 @@
 #define SCUDO_MALLOC_ALIGNMENT FIRST_32_SECOND_64(8U, 16U)
 #endif
 
+extern "C" {
+
 INTERFACE WEAK void *SCUDO_PREFIX(calloc)(size_t nmemb, size_t size) {
   scudo::uptr Product;
   if (UNLIKELY(scudo::checkForCallocOverflow(size, nmemb, &Product))) {
@@ -141,11 +143,16 @@ INTERFACE WEAK int SCUDO_PREFIX(malloc_iterate)(
   return 0;
 }
 
+INTERFACE WEAK void SCUDO_PREFIX(malloc_enable)() { SCUDO_ALLOCATOR.enable(); }
+
 INTERFACE WEAK void SCUDO_PREFIX(malloc_disable)() {
   SCUDO_ALLOCATOR.disable();
 }
 
-INTERFACE WEAK void SCUDO_PREFIX(malloc_enable)() { SCUDO_ALLOCATOR.enable(); }
+void SCUDO_PREFIX(malloc_postinit)() {
+  pthread_atfork(SCUDO_PREFIX(malloc_disable), SCUDO_PREFIX(malloc_enable),
+                 SCUDO_PREFIX(malloc_enable));
+}
 
 INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) {
   if (param == M_DECAY_TIME) {
@@ -176,3 +183,5 @@ INTERFACE WEAK int SCUDO_PREFIX(malloc_info)(UNUSED int options, FILE *stream) {
   fputs("</malloc>", stream);
   return 0;
 }
+
+} // extern "C"

diff  --git a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
index fa4145c066b6..f004369d96cb 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
@@ -18,22 +18,40 @@
 #include <stdint.h>
 #include <stdio.h>
 
-static scudo::Allocator<scudo::AndroidConfig> Allocator;
-static scudo::Allocator<scudo::AndroidSvelteConfig> SvelteAllocator;
-
-extern "C" {
-
 // Regular MallocDispatch definitions.
 #define SCUDO_PREFIX(name) CONCATENATE(scudo_, name)
 #define SCUDO_ALLOCATOR Allocator
+
+extern "C" void SCUDO_PREFIX(malloc_postinit)();
+static scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)>
+    SCUDO_ALLOCATOR;
+// Pointer to the static allocator so that the C++ wrappers can access it.
+// Technically we could have a completely separated heap for C & C++ but in
+// reality the amount of cross pollination between the two is staggering.
+scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)> *
+    CONCATENATE(SCUDO_ALLOCATOR, Ptr) = &SCUDO_ALLOCATOR;
+
 #include "wrappers_c.inc"
+
 #undef SCUDO_ALLOCATOR
 #undef SCUDO_PREFIX
 
 // Svelte MallocDispatch definitions.
 #define SCUDO_PREFIX(name) CONCATENATE(scudo_svelte_, name)
 #define SCUDO_ALLOCATOR SvelteAllocator
+
+extern "C" void SCUDO_PREFIX(malloc_postinit)();
+static scudo::Allocator<scudo::AndroidSvelteConfig,
+                        SCUDO_PREFIX(malloc_postinit)>
+    SCUDO_ALLOCATOR;
+// Pointer to the static allocator so that the C++ wrappers can access it.
+// Technically we could have a completely separated heap for C & C++ but in
+// reality the amount of cross pollination between the two is staggering.
+scudo::Allocator<scudo::AndroidSvelteConfig, SCUDO_PREFIX(malloc_postinit)> *
+    CONCATENATE(SCUDO_ALLOCATOR, Ptr) = &SCUDO_ALLOCATOR;
+
 #include "wrappers_c.inc"
+
 #undef SCUDO_ALLOCATOR
 #undef SCUDO_PREFIX
 
@@ -44,6 +62,4 @@ INTERFACE void __scudo_print_stats(void) {
   SvelteAllocator.printStats();
 }
 
-} // extern "C"
-
 #endif // SCUDO_ANDROID && _BIONIC

diff  --git a/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp b/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp
index 72235e9c9820..1da5385c7789 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp
+++ b/compiler-rt/lib/scudo/standalone/wrappers_cpp.cpp
@@ -15,7 +15,8 @@
 
 #include <stdint.h>
 
-extern scudo::Allocator<scudo::Config> *AllocatorPtr;
+extern "C" void malloc_postinit();
+extern scudo::Allocator<scudo::Config, malloc_postinit> *AllocatorPtr;
 
 namespace std {
 struct nothrow_t {};


        


More information about the llvm-commits mailing list