[compiler-rt] 6d1f860 - tsan: optimize DenseSlabAlloc
Dmitry Vyukov via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 19 06:43:10 PDT 2022
Author: Dmitry Vyukov
Date: 2022-07-19T15:42:57+02:00
New Revision: 6d1f86095de94739d3ed74ca314541a1794db47f
URL: https://github.com/llvm/llvm-project/commit/6d1f86095de94739d3ed74ca314541a1794db47f
DIFF: https://github.com/llvm/llvm-project/commit/6d1f86095de94739d3ed74ca314541a1794db47f.diff
LOG: tsan: optimize DenseSlabAlloc
If lots of threads do lots of malloc/free and overflow the per-thread
DenseSlabAlloc cache, it causes heavy contention on the allocator mutex:
31.97% race.old race.old [.] __sanitizer::StaticSpinMutex::LockSlow
17.61% race.old race.old [.] __tsan_read4
10.77% race.old race.old [.] __tsan::SlotLock
Optimize DenseSlabAlloc to use a lock-free stack of batches of nodes.
This way we take no locks at all in the steady state and do only one
push/pop per Refill/Drain.
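For readers unfamiliar with the pattern, here is a minimal standalone sketch of
such a counter-tagged lock-free stack of node batches (illustrative names and a
flat storage array instead of tsan's two-level map_; it is not the tsan source):

#include <atomic>
#include <cstdint>

using IndexT = uint32_t;

struct Block {
  IndexT next;   // links whole batches on the stack
  IndexT batch;  // links nodes inside one batch
};

// Illustrative flat storage; the real allocator maps indices via map_[l1][l2].
// Index 0 is reserved as the invalid/"empty" index.
static Block storage[1024];
static Block *MapBlock(IndexT idx) { return &storage[idx]; }

// Low 32 bits of the word hold the index of the top batch,
// the high 32 bits hold an ABA counter.
static constexpr uint64_t kCounterInc = 1ull << 32;
static constexpr uint64_t kCounterMask = ~(kCounterInc - 1);
static std::atomic<uint64_t> freelist{0};

// Push one batch whose head node is head_idx (nodes linked via Block::batch).
void PushBatch(IndexT head_idx) {
  Block *head = MapBlock(head_idx);
  uint64_t cmp = freelist.load(std::memory_order_acquire);
  uint64_t xchg;
  do {
    head->next = static_cast<IndexT>(cmp);  // link under the current top
    xchg = head_idx | ((cmp & kCounterMask) + kCounterInc);  // bump counter
  } while (!freelist.compare_exchange_weak(cmp, xchg,
                                           std::memory_order_acq_rel));
}

// Pop one whole batch; returns 0 if the stack is empty.
IndexT PopBatch() {
  uint64_t cmp = freelist.load(std::memory_order_acquire);
  uint64_t xchg;
  IndexT idx;
  do {
    idx = static_cast<IndexT>(cmp);
    if (!idx)
      return 0;
    xchg = MapBlock(idx)->next | (cmp & kCounterMask);  // counter unchanged
  } while (!freelist.compare_exchange_weak(cmp, xchg,
                                           std::memory_order_acq_rel));
  return idx;  // caller walks Block::batch links to unpack the nodes
}

In the patch, Refill is essentially PopBatch plus unpacking the batch into the
per-thread cache array, and Drain links up to Cache::kSize/2 cached nodes via
Block::batch and pushes them as one batch. The counter in the high 32 bits
prevents the classic ABA problem: a popping thread preempted after reading the
head and its next link could otherwise complete a stale compare-exchange if the
same head index had meanwhile been popped and pushed back; since every push
adds kCounterInc, such an interleaving changes the 64-bit word and the stale
CAS fails.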
Effect on the added benchmark (columns: elapsed, user CPU and system CPU
seconds, max RSS in KB):
$ TIME="%e %U %S %M" time ./test.old 36 5 2000000
34.51 978.22 175.67 5833592
32.53 891.73 167.03 5790036
36.17 1005.54 201.24 5802828
36.94 1004.76 226.58 5803188
$ TIME="%e %U %S %M" time ./test.new 36 5 2000000
26.44 720.99 13.45 5750704
25.92 721.98 13.58 5767764
26.33 725.15 13.41 5777936
25.93 713.49 13.41 5791796
Reviewed By: melver
Differential Revision: https://reviews.llvm.org/D130002
Added:
compiler-rt/test/tsan/bench_malloc.cpp
Modified:
compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
Removed:
################################################################################
diff --git a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
index 7a39a39d51de4..2eaff39057bc5 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
@@ -85,14 +85,7 @@ class DenseSlabAlloc {
}
void FlushCache(Cache *c) {
- if (!c->pos)
- return;
- SpinMutexLock lock(&mtx_);
- while (c->pos) {
- IndexT idx = c->cache[--c->pos];
- *(IndexT*)Map(idx) = freelist_;
- freelist_ = idx;
- }
+ while (c->pos) Drain(c);
}
void InitCache(Cache *c) {
@@ -106,7 +99,7 @@ class DenseSlabAlloc {
template <typename Func>
void ForEach(Func func) {
- SpinMutexLock lock(&mtx_);
+ Lock lock(&mtx_);
uptr fillpos = atomic_load_relaxed(&fillpos_);
for (uptr l1 = 0; l1 < fillpos; l1++) {
for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
@@ -115,48 +108,86 @@ class DenseSlabAlloc {
private:
T *map_[kL1Size];
- SpinMutex mtx_;
- IndexT freelist_ = {0};
+ Mutex mtx_;
+ // The freelist is organized as a lock-free stack of batches of nodes.
+ // The stack itself uses Block::next links, while the batch within each
+ // stack node uses Block::batch links.
+ // Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
+ atomic_uint64_t freelist_ = {0};
atomic_uintptr_t fillpos_ = {0};
const char *const name_;
- void Refill(Cache *c) {
- SpinMutexLock lock(&mtx_);
- if (freelist_ == 0) {
- uptr fillpos = atomic_load_relaxed(&fillpos_);
- if (fillpos == kL1Size) {
- Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",
- name_, kL1Size, kL2Size);
- Die();
- }
- VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
- fillpos, kL1Size, kL2Size);
- T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);
- // Reserve 0 as invalid index.
- IndexT start = fillpos == 0 ? 1 : 0;
- for (IndexT i = start; i < kL2Size; i++) {
- new(batch + i) T;
- *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;
- }
- *(IndexT*)(batch + kL2Size - 1) = 0;
- freelist_ = fillpos * kL2Size + start;
- map_[fillpos] = batch;
- atomic_store_relaxed(&fillpos_, fillpos + 1);
- }
- for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
- IndexT idx = freelist_;
+ struct Block {
+ IndexT next;
+ IndexT batch;
+ };
+
+ Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }
+
+ static constexpr u64 kCounterInc = 1ull << 32;
+ static constexpr u64 kCounterMask = ~(kCounterInc - 1);
+
+ NOINLINE void Refill(Cache *c) {
+ // Pop 1 batch of nodes from the freelist.
+ IndexT idx;
+ u64 xchg;
+ u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+ do {
+ idx = static_cast<IndexT>(cmp);
+ if (!idx)
+ return AllocSuperBlock(c);
+ Block *ptr = MapBlock(idx);
+ xchg = ptr->next | (cmp & kCounterMask);
+ } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+ memory_order_acq_rel));
+ // Unpack it into c->cache.
+ while (idx) {
c->cache[c->pos++] = idx;
- freelist_ = *(IndexT*)Map(idx);
+ idx = MapBlock(idx)->batch;
}
}
- void Drain(Cache *c) {
- SpinMutexLock lock(&mtx_);
- for (uptr i = 0; i < Cache::kSize / 2; i++) {
+ NOINLINE void Drain(Cache *c) {
+ // Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
+ IndexT head_idx = 0;
+ for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
IndexT idx = c->cache[--c->pos];
- *(IndexT*)Map(idx) = freelist_;
- freelist_ = idx;
+ Block *ptr = MapBlock(idx);
+ ptr->batch = head_idx;
+ head_idx = idx;
+ }
+ // Push it onto the freelist stack.
+ Block *head = MapBlock(head_idx);
+ u64 xchg;
+ u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+ do {
+ head->next = static_cast<IndexT>(cmp);
+ xchg = head_idx | (cmp & kCounterMask) + kCounterInc;
+ } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+ memory_order_acq_rel));
+ }
+
+ NOINLINE void AllocSuperBlock(Cache *c) {
+ Lock lock(&mtx_);
+ uptr fillpos = atomic_load_relaxed(&fillpos_);
+ if (fillpos == kL1Size) {
+ Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
+ kL2Size);
+ Die();
+ }
+ VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
+ fillpos, kL1Size, kL2Size);
+ T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
+ map_[fillpos] = batch;
+ // Reserve 0 as invalid index.
+ for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
+ new (batch + i) T;
+ c->cache[c->pos++] = i + fillpos * kL2Size;
+ if (c->pos == Cache::kSize)
+ Drain(c);
}
+ atomic_store_relaxed(&fillpos_, fillpos + 1);
+ CHECK(c->pos);
}
};
diff --git a/compiler-rt/test/tsan/bench_malloc.cpp b/compiler-rt/test/tsan/bench_malloc.cpp
new file mode 100644
index 0000000000000..fb15ab25bec0a
--- /dev/null
+++ b/compiler-rt/test/tsan/bench_malloc.cpp
@@ -0,0 +1,22 @@
+// RUN: %clangxx_tsan %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// bench.h needs pthread barriers which are not available on OS X
+// UNSUPPORTED: darwin
+
+#include "bench.h"
+
+void thread(int tid) {
+ void **blocks = new void *[bench_mode];
+ for (int i = 0; i < bench_niter; i++) {
+ for (int j = 0; j < bench_mode; j++)
+ blocks[j] = malloc(8);
+ for (int j = 0; j < bench_mode; j++)
+ free(blocks[j]);
+ }
+ delete[] blocks;
+}
+
+void bench() { start_thread_group(bench_nthread, thread); }
+
+// CHECK: DONE
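The numbers in the commit message come from running the compiled test directly
with explicit parameters (./test 36 5 2000000); matching the bench_nthread,
bench_mode and bench_niter globals used above, that presumably means 36
threads, 5 live blocks per iteration and 2,000,000 iterations, though the
argument order is an assumption based on the names. A rough standalone
stand-in for the bench.h harness (hypothetical, without the pthread-barrier
start synchronization the real header provides) could look like:

#include <cstdio>
#include <cstdlib>
#include <thread>
#include <vector>

// Hypothetical stand-ins for the bench.h globals; not the real test harness.
static int bench_nthread, bench_mode, bench_niter;

static void thread_fn(int tid) {
  void **blocks = new void *[bench_mode];
  for (int i = 0; i < bench_niter; i++) {
    for (int j = 0; j < bench_mode; j++)
      blocks[j] = malloc(8);
    for (int j = 0; j < bench_mode; j++)
      free(blocks[j]);
  }
  delete[] blocks;
}

int main(int argc, char **argv) {
  // Assumed argument order: nthread, mode, niter (defaults are arbitrary).
  bench_nthread = argc > 1 ? atoi(argv[1]) : 4;
  bench_mode = argc > 2 ? atoi(argv[2]) : 5;
  bench_niter = argc > 3 ? atoi(argv[3]) : 100000;
  std::vector<std::thread> threads;
  for (int t = 0; t < bench_nthread; t++)
    threads.emplace_back(thread_fn, t);
  for (auto &th : threads)
    th.join();
  printf("DONE\n");
  return 0;
}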