[compiler-rt] 6d1f860 - tsan: optimize DenseSlabAlloc

Dmitry Vyukov via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 19 06:43:10 PDT 2022


Author: Dmitry Vyukov
Date: 2022-07-19T15:42:57+02:00
New Revision: 6d1f86095de94739d3ed74ca314541a1794db47f

URL: https://github.com/llvm/llvm-project/commit/6d1f86095de94739d3ed74ca314541a1794db47f
DIFF: https://github.com/llvm/llvm-project/commit/6d1f86095de94739d3ed74ca314541a1794db47f.diff

LOG: tsan: optimize DenseSlabAlloc

If lots of threads do lots of malloc/free and overflow the per-thread
DenseSlabAlloc cache, it causes lots of contention:

  31.97%  race.old  race.old            [.] __sanitizer::StaticSpinMutex::LockSlow
  17.61%  race.old  race.old            [.] __tsan_read4
  10.77%  race.old  race.old            [.] __tsan::SlotLock

Optimize DenseSlabAlloc to use a lock-free stack of batches of nodes.
This way we take no locks at all in the steady state and do only
one push/pop per Refill/Drain.
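
For readers unfamiliar with the pattern, below is a minimal self-contained
sketch of such a lock-free stack of batches: a Treiber stack whose 64-bit
head packs a 32-bit node index in the low bits and an ABA counter in the
high bits, mirroring the freelist_ layout in the diff below. It is an
illustration only, not the tsan code; the names (BatchStack, PushBatch,
PopBatch) and the std::atomic-based types are mine.

  #include <atomic>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Nodes are addressed by 32-bit indices into a flat array; index 0 is
  // reserved as "empty"/invalid, exactly like in DenseSlabAlloc.
  struct Node {
    uint32_t next;   // links batches on the stack
    uint32_t batch;  // links nodes within one batch
  };

  class BatchStack {
    static constexpr uint64_t kCounterInc = 1ull << 32;
    static constexpr uint64_t kCounterMask = ~(kCounterInc - 1);

    std::vector<Node> nodes_;        // slot 0 is unused
    std::atomic<uint64_t> head_{0};  // low 32 bits: top index, high 32: ABA counter

   public:
    explicit BatchStack(size_t n) : nodes_(n + 1) {}

    Node &at(uint32_t idx) { return nodes_[idx]; }

    // Push a batch whose nodes are pre-linked via Node::batch; head_idx is
    // the first node of the batch. Only the push bumps the ABA counter.
    void PushBatch(uint32_t head_idx) {
      uint64_t cmp = head_.load(std::memory_order_acquire);
      uint64_t xchg;
      do {
        nodes_[head_idx].next = static_cast<uint32_t>(cmp);
        xchg = head_idx | ((cmp & kCounterMask) + kCounterInc);
      } while (!head_.compare_exchange_weak(cmp, xchg,
                                            std::memory_order_acq_rel));
    }

    // Pop one whole batch; returns 0 if the stack is empty.
    uint32_t PopBatch() {
      uint64_t cmp = head_.load(std::memory_order_acquire);
      uint64_t xchg;
      uint32_t idx;
      do {
        idx = static_cast<uint32_t>(cmp);
        if (!idx)
          return 0;
        xchg = nodes_[idx].next | (cmp & kCounterMask);
      } while (!head_.compare_exchange_weak(cmp, xchg,
                                            std::memory_order_acq_rel));
      return idx;
    }
  };

  int main() {
    BatchStack s(8);
    // Build one batch {1 -> 2 -> 3} linked via Node::batch and push it.
    s.at(1).batch = 2;
    s.at(2).batch = 3;
    s.at(3).batch = 0;
    s.PushBatch(1);
    assert(s.PopBatch() == 1);  // the whole batch comes back in one pop
    assert(s.PopBatch() == 0);  // stack is empty again
  }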

Effect on the added benchmark (TIME columns: elapsed, user and system
time in seconds, and max RSS in KB):

$ TIME="%e %U %S %M" time ./test.old 36 5 2000000
34.51 978.22 175.67 5833592
32.53 891.73 167.03 5790036
36.17 1005.54 201.24 5802828
36.94 1004.76 226.58 5803188

$ TIME="%e %U %S %M" time ./test.new 36 5 2000000
26.44 720.99 13.45 5750704
25.92 721.98 13.58 5767764
26.33 725.15 13.41 5777936
25.93 713.49 13.41 5791796

Reviewed By: melver

Differential Revision: https://reviews.llvm.org/D130002

Added: 
    compiler-rt/test/tsan/bench_malloc.cpp

Modified: 
    compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h

Removed: 
    


################################################################################
diff --git a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
index 7a39a39d51de4..2eaff39057bc5 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
@@ -85,14 +85,7 @@ class DenseSlabAlloc {
   }
 
   void FlushCache(Cache *c) {
-    if (!c->pos)
-      return;
-    SpinMutexLock lock(&mtx_);
-    while (c->pos) {
-      IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
-    }
+    while (c->pos) Drain(c);
   }
 
   void InitCache(Cache *c) {
@@ -106,7 +99,7 @@ class DenseSlabAlloc {
 
   template <typename Func>
   void ForEach(Func func) {
-    SpinMutexLock lock(&mtx_);
+    Lock lock(&mtx_);
     uptr fillpos = atomic_load_relaxed(&fillpos_);
     for (uptr l1 = 0; l1 < fillpos; l1++) {
       for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
@@ -115,48 +108,86 @@ class DenseSlabAlloc {
 
  private:
   T *map_[kL1Size];
-  SpinMutex mtx_;
-  IndexT freelist_ = {0};
+  Mutex mtx_;
+  // The freelist is organized as a lock-free stack of batches of nodes.
+  // The stack itself uses Block::next links, while the batch within each
+  // stack node uses Block::batch links.
+  // Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
+  atomic_uint64_t freelist_ = {0};
   atomic_uintptr_t fillpos_ = {0};
   const char *const name_;
 
-  void Refill(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    if (freelist_ == 0) {
-      uptr fillpos = atomic_load_relaxed(&fillpos_);
-      if (fillpos == kL1Size) {
-        Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",
-            name_, kL1Size, kL2Size);
-        Die();
-      }
-      VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
-              fillpos, kL1Size, kL2Size);
-      T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);
-      // Reserve 0 as invalid index.
-      IndexT start = fillpos == 0 ? 1 : 0;
-      for (IndexT i = start; i < kL2Size; i++) {
-        new(batch + i) T;
-        *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;
-      }
-      *(IndexT*)(batch + kL2Size - 1) = 0;
-      freelist_ = fillpos * kL2Size + start;
-      map_[fillpos] = batch;
-      atomic_store_relaxed(&fillpos_, fillpos + 1);
-    }
-    for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
-      IndexT idx = freelist_;
+  struct Block {
+    IndexT next;
+    IndexT batch;
+  };
+
+  Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }
+
+  static constexpr u64 kCounterInc = 1ull << 32;
+  static constexpr u64 kCounterMask = ~(kCounterInc - 1);
+
+  NOINLINE void Refill(Cache *c) {
+    // Pop 1 batch of nodes from the freelist.
+    IndexT idx;
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      idx = static_cast<IndexT>(cmp);
+      if (!idx)
+        return AllocSuperBlock(c);
+      Block *ptr = MapBlock(idx);
+      xchg = ptr->next | (cmp & kCounterMask);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+    // Unpack it into c->cache.
+    while (idx) {
       c->cache[c->pos++] = idx;
-      freelist_ = *(IndexT*)Map(idx);
+      idx = MapBlock(idx)->batch;
     }
   }
 
-  void Drain(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    for (uptr i = 0; i < Cache::kSize / 2; i++) {
+  NOINLINE void Drain(Cache *c) {
+    // Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
+    IndexT head_idx = 0;
+    for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
       IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
+      Block *ptr = MapBlock(idx);
+      ptr->batch = head_idx;
+      head_idx = idx;
+    }
+    // Push it onto the freelist stack.
+    Block *head = MapBlock(head_idx);
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      head->next = static_cast<IndexT>(cmp);
+      xchg = head_idx | (cmp & kCounterMask) + kCounterInc;
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+  }
+
+  NOINLINE void AllocSuperBlock(Cache *c) {
+    Lock lock(&mtx_);
+    uptr fillpos = atomic_load_relaxed(&fillpos_);
+    if (fillpos == kL1Size) {
+      Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
+             kL2Size);
+      Die();
+    }
+    VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
+            fillpos, kL1Size, kL2Size);
+    T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
+    map_[fillpos] = batch;
+    // Reserve 0 as invalid index.
+    for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
+      new (batch + i) T;
+      c->cache[c->pos++] = i + fillpos * kL2Size;
+      if (c->pos == Cache::kSize)
+        Drain(c);
     }
+    atomic_store_relaxed(&fillpos_, fillpos + 1);
+    CHECK(c->pos);
   }
 };
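
For context, the steady-state fast paths that feed into the new Refill/Drain
are not part of this diff. The following is a paraphrased, self-contained
sketch of their assumed shape: the Cache layout (an index array plus a
position) is visible in the hunks above, but the Alloc/Free signatures and
the trivial Refill/Drain stubs here are illustrative, not quoted from
tsan_dense_alloc.h.

  #include <cstdint>
  #include <cstdio>

  using IndexT = uint32_t;

  struct Cache {
    static const uint32_t kSize = 8;  // the real cache is larger
    IndexT cache[kSize];
    uint32_t pos = 0;
  };

  static IndexT next_fresh_index = 1;  // 0 is reserved as the invalid index

  // Stub standing in for the lock-free batch pop in the diff above.
  void Refill(Cache *c) {
    while (c->pos < Cache::kSize / 2)
      c->cache[c->pos++] = next_fresh_index++;
  }

  // Stub standing in for the lock-free batch push in the diff above;
  // the real Drain returns half the cache to the shared freelist as one batch.
  void Drain(Cache *c) { c->pos -= Cache::kSize / 2; }

  // The point of the commit: these fast paths touch only the thread-local
  // array; Refill/Drain run only when the cache is empty or full, and they
  // no longer take the allocator mutex.
  IndexT Alloc(Cache *c) {
    if (c->pos == 0)
      Refill(c);
    return c->cache[--c->pos];
  }

  void Free(Cache *c, IndexT idx) {
    if (c->pos == Cache::kSize)
      Drain(c);
    c->cache[c->pos++] = idx;
  }

  int main() {
    Cache c;
    IndexT a = Alloc(&c), b = Alloc(&c);
    Free(&c, b);
    Free(&c, a);
    printf("allocated and freed indices %u and %u\n", a, b);
  }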
 

diff --git a/compiler-rt/test/tsan/bench_malloc.cpp b/compiler-rt/test/tsan/bench_malloc.cpp
new file mode 100644
index 0000000000000..fb15ab25bec0a
--- /dev/null
+++ b/compiler-rt/test/tsan/bench_malloc.cpp
@@ -0,0 +1,22 @@
+// RUN: %clangxx_tsan %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// bench.h needs pthread barriers which are not available on OS X
+// UNSUPPORTED: darwin
+
+#include "bench.h"
+
+void thread(int tid) {
+  void **blocks = new void *[bench_mode];
+  for (int i = 0; i < bench_niter; i++) {
+    for (int j = 0; j < bench_mode; j++)
+      blocks[j] = malloc(8);
+    for (int j = 0; j < bench_mode; j++)
+      free(blocks[j]);
+  }
+  delete[] blocks;
+}
+
+void bench() { start_thread_group(bench_nthread, thread); }
+
+// CHECK: DONE



