[compiler-rt] r214912 - tsan: allocate vector clocks using slab allocator

Tue Aug 5 11:45:03 PDT 2014

Author: dvyukov
Date: Tue Aug  5 13:45:02 2014
New Revision: 214912

URL: http://llvm.org/viewvc/llvm-project?rev=214912&view=rev
Log:
tsan: allocate vector clocks using slab allocator

Vector clocks are the most actively allocated objects in the tsan runtime.
The current internal allocator is not scalable enough to handle the
allocation of clocks efficiently (its caches are too small). This change
transforms clocks into a 2-level array with 512-byte blocks. Since all blocks
are of the same size, it's possible to cache them more efficiently in
per-thread caches.



Added:
    compiler-rt/trunk/test/tsan/thread_detach.c
Modified:
    compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.cc
    compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.h
    compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h
    compiler-rt/trunk/lib/tsan/rtl/tsan_flags.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.h
    compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_thread.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_sync.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_sync.h
    compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc
    compiler-rt/trunk/lib/tsan/tests/unit/tsan_sync_test.cc

Modified: compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================

--- compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.cc (original)
+++ compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.cc Tue Aug  5 13:45:02 2014
@@ -219,6 +219,10 @@ void ThreadRegistry::SetThreadNameByUser
 }
 
 void ThreadRegistry::DetachThread(u32 tid) {
+  DetachThread(tid, 0);
+}
+
+void ThreadRegistry::DetachThread(u32 tid, void *arg) {
   BlockingMutexLock l(&mtx_);
   CHECK_LT(tid, n_contexts_);
   ThreadContextBase *tctx = threads_[tid];
@@ -227,6 +231,7 @@ void ThreadRegistry::DetachThread(u32 ti
     Report("%s: Detach of non-existent thread\n", SanitizerToolName);
     return;
   }
+  tctx->OnDetached(arg);
   if (tctx->status == ThreadStatusFinished) {
     tctx->SetDead();
     QuarantinePush(tctx);

Modified: compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.h?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.h (original)
+++ compiler-rt/trunk/lib/sanitizer_common/sanitizer_thread_registry.h Tue Aug  5 13:45:02 2014
@@ -68,6 +68,7 @@ class ThreadContextBase {
   virtual void OnStarted(void *arg) {}
   virtual void OnCreated(void *arg) {}
   virtual void OnReset() {}
+  virtual void OnDetached(void *arg) {}
 };
 
 typedef ThreadContextBase* (*ThreadContextFactory)(u32 tid);
@@ -111,6 +112,7 @@ class ThreadRegistry {
   void SetThreadName(u32 tid, const char *name);
   void SetThreadNameByUserId(uptr user_id, const char *name);
   void DetachThread(u32 tid);
+  void DetachThread(u32 tid, void *arg);
   void JoinThread(u32 tid, void *arg);
   void FinishThread(u32 tid);
   void StartThread(u32 tid, uptr os_id, void *arg);

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc Tue Aug  5 13:45:02 2014
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 #include "tsan_clock.h"
 #include "tsan_rtl.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
 
 // SyncClock and ThreadClock implement vector clocks for sync variables
 // (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
@@ -102,13 +103,13 @@ ThreadClock::ThreadClock(unsigned tid, u
   clk_[tid_].reused = reused_;
 }
 
-void ThreadClock::acquire(const SyncClock *src) {
+void ThreadClock::acquire(ClockCache *c, const SyncClock *src) {
   DCHECK(nclk_ <= kMaxTid);
-  DCHECK(src->clk_.Size() <= kMaxTid);
+  DCHECK(src->size_ <= kMaxTid);
   CPP_STAT_INC(StatClockAcquire);
 
   // Check if it's empty -> no need to do anything.
-  const uptr nclk = src->clk_.Size();
+  const uptr nclk = src->size_;
   if (nclk == 0) {
     CPP_STAT_INC(StatClockAcquireEmpty);
     return;
@@ -118,12 +119,12 @@ void ThreadClock::acquire(const SyncCloc
   bool acquired = false;
   if (nclk > tid_) {
     CPP_STAT_INC(StatClockAcquireLarge);
-    if (src->clk_[tid_].reused == reused_) {
+    if (src->elem(tid_).reused == reused_) {
       CPP_STAT_INC(StatClockAcquireRepeat);
       for (unsigned i = 0; i < kDirtyTids; i++) {
         unsigned tid = src->dirty_tids_[i];
         if (tid != kInvalidTid) {
-          u64 epoch = src->clk_[tid].epoch;
+          u64 epoch = src->elem(tid).epoch;
           if (clk_[tid].epoch < epoch) {
             clk_[tid].epoch = epoch;
             acquired = true;
@@ -142,7 +143,7 @@ void ThreadClock::acquire(const SyncCloc
   CPP_STAT_INC(StatClockAcquireFull);
   nclk_ = max(nclk_, nclk);
   for (uptr i = 0; i < nclk; i++) {
-    u64 epoch = src->clk_[i].epoch;
+    u64 epoch = src->elem(i).epoch;
     if (clk_[i].epoch < epoch) {
       clk_[i].epoch = epoch;
       acquired = true;
@@ -151,7 +152,7 @@ void ThreadClock::acquire(const SyncCloc
 
   // Remember that this thread has acquired this clock.
   if (nclk > tid_)
-    src->clk_[tid_].reused = reused_;
+    src->elem(tid_).reused = reused_;
 
   if (acquired) {
     CPP_STAT_INC(StatClockAcquiredSomething);
@@ -159,28 +160,26 @@ void ThreadClock::acquire(const SyncCloc
   }
 }
 
-void ThreadClock::release(SyncClock *dst) const {
+void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
   DCHECK_LE(nclk_, kMaxTid);
-  DCHECK_LE(dst->clk_.Size(), kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
 
-  if (dst->clk_.Size() == 0) {
+  if (dst->size_ == 0) {
     // ReleaseStore will correctly set release_store_tid_,
     // which can be important for future operations.
-    ReleaseStore(dst);
+    ReleaseStore(c, dst);
     return;
   }
 
   CPP_STAT_INC(StatClockRelease);
   // Check if we need to resize dst.
-  if (dst->clk_.Size() < nclk_) {
-    CPP_STAT_INC(StatClockReleaseResize);
-    dst->clk_.Resize(nclk_);
-  }
+  if (dst->size_ < nclk_)
+    Resize(c, dst);
 
   // Check if we had not acquired anything from other threads
   // since the last release on dst. If so, we need to update
-  // only dst->clk_[tid_].
-  if (dst->clk_[tid_].epoch > last_acquire_) {
+  // only dst->elem(tid_).
+  if (dst->elem(tid_).epoch > last_acquire_) {
     UpdateCurrentThread(dst);
     if (dst->release_store_tid_ != tid_ ||
         dst->release_store_reused_ != reused_)
@@ -196,14 +195,15 @@ void ThreadClock::release(SyncClock *dst
     CPP_STAT_INC(StatClockReleaseAcquired);
   // Update dst->clk_.
   for (uptr i = 0; i < nclk_; i++) {
-    dst->clk_[i].epoch = max(dst->clk_[i].epoch, clk_[i].epoch);
-    dst->clk_[i].reused = 0;
+    ClockElem &ce = dst->elem(i);
+    ce.epoch = max(ce.epoch, clk_[i].epoch);
+    ce.reused = 0;
   }
   // Clear 'acquired' flag in the remaining elements.
-  if (nclk_ < dst->clk_.Size())
+  if (nclk_ < dst->size_)
     CPP_STAT_INC(StatClockReleaseClearTail);
-  for (uptr i = nclk_; i < dst->clk_.Size(); i++)
-    dst->clk_[i].reused = 0;
+  for (uptr i = nclk_; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
   for (unsigned i = 0; i < kDirtyTids; i++)
     dst->dirty_tids_[i] = kInvalidTid;
   dst->release_store_tid_ = kInvalidTid;
@@ -211,23 +211,21 @@ void ThreadClock::release(SyncClock *dst
   // If we've acquired dst, remember this fact,
   // so that we don't need to acquire it on next acquire.
   if (acquired)
-    dst->clk_[tid_].reused = reused_;
+    dst->elem(tid_).reused = reused_;
 }
 
-void ThreadClock::ReleaseStore(SyncClock *dst) const {
+void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const {
   DCHECK(nclk_ <= kMaxTid);
-  DCHECK(dst->clk_.Size() <= kMaxTid);
+  DCHECK(dst->size_ <= kMaxTid);
   CPP_STAT_INC(StatClockStore);
 
   // Check if we need to resize dst.
-  if (dst->clk_.Size() < nclk_) {
-    CPP_STAT_INC(StatClockStoreResize);
-    dst->clk_.Resize(nclk_);
-  }
+  if (dst->size_ < nclk_)
+    Resize(c, dst);
 
   if (dst->release_store_tid_ == tid_ &&
       dst->release_store_reused_ == reused_ &&
-      dst->clk_[tid_].epoch > last_acquire_) {
+      dst->elem(tid_).epoch > last_acquire_) {
     CPP_STAT_INC(StatClockStoreFast);
     UpdateCurrentThread(dst);
     return;
@@ -236,13 +234,17 @@ void ThreadClock::ReleaseStore(SyncClock
   // O(N) release-store.
   CPP_STAT_INC(StatClockStoreFull);
   for (uptr i = 0; i < nclk_; i++) {
-    dst->clk_[i].epoch = clk_[i].epoch;
-    dst->clk_[i].reused = 0;
+    ClockElem &ce = dst->elem(i);
+    ce.epoch = clk_[i].epoch;
+    ce.reused = 0;
   }
   // Clear the tail of dst->clk_.
-  if (nclk_ < dst->clk_.Size()) {
-    internal_memset(&dst->clk_[nclk_], 0,
-        (dst->clk_.Size() - nclk_) * sizeof(dst->clk_[0]));
+  if (nclk_ < dst->size_) {
+    for (uptr i = nclk_; i < dst->size_; i++) {
+      ClockElem &ce = dst->elem(i);
+      ce.epoch = 0;
+      ce.reused = 0;
+    }
     CPP_STAT_INC(StatClockStoreTail);
   }
   for (unsigned i = 0; i < kDirtyTids; i++)
@@ -250,19 +252,19 @@ void ThreadClock::ReleaseStore(SyncClock
   dst->release_store_tid_ = tid_;
   dst->release_store_reused_ = reused_;
   // Rememeber that we don't need to acquire it in future.
-  dst->clk_[tid_].reused = reused_;
+  dst->elem(tid_).reused = reused_;
 }
 
-void ThreadClock::acq_rel(SyncClock *dst) {
+void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
   CPP_STAT_INC(StatClockAcquireRelease);
-  acquire(dst);
-  ReleaseStore(dst);
+  acquire(c, dst);
+  ReleaseStore(c, dst);
 }
 
 // Updates only single element related to the current thread in dst->clk_.
 void ThreadClock::UpdateCurrentThread(SyncClock *dst) const {
   // Update the threads time, but preserve 'acquired' flag.
-  dst->clk_[tid_].epoch = clk_[tid_].epoch;
+  dst->elem(tid_).epoch = clk_[tid_].epoch;
 
   for (unsigned i = 0; i < kDirtyTids; i++) {
     if (dst->dirty_tids_[i] == tid_) {
@@ -277,27 +279,73 @@ void ThreadClock::UpdateCurrentThread(Sy
   }
   // Reset all 'acquired' flags, O(N).
   CPP_STAT_INC(StatClockReleaseSlow);
-  for (uptr i = 0; i < dst->clk_.Size(); i++) {
-    dst->clk_[i].reused = 0;
-  }
+  for (uptr i = 0; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
   for (unsigned i = 0; i < kDirtyTids; i++)
     dst->dirty_tids_[i] = kInvalidTid;
 }
 
 // Checks whether the current threads has already acquired src.
 bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
-  if (src->clk_[tid_].reused != reused_)
+  if (src->elem(tid_).reused != reused_)
     return false;
   for (unsigned i = 0; i < kDirtyTids; i++) {
     unsigned tid = src->dirty_tids_[i];
     if (tid != kInvalidTid) {
-      if (clk_[tid].epoch < src->clk_[tid].epoch)
+      if (clk_[tid].epoch < src->elem(tid).epoch)
         return false;
     }
   }
   return true;
 }
 
+void ThreadClock::Resize(ClockCache *c, SyncClock *dst) const {
+  CPP_STAT_INC(StatClockReleaseResize);
+  if (RoundUpTo(nclk_, ClockBlock::kClockCount) <=
+      RoundUpTo(dst->size_, ClockBlock::kClockCount)) {
+    // Growing within the same block.
+    // Memory is already allocated, just increase the size.
+    dst->size_ = nclk_;
+    return;
+  }
+  if (nclk_ <= ClockBlock::kClockCount) {
+    // Grow from 0 to one-level table.
+    CHECK_EQ(dst->size_, 0);
+    CHECK_EQ(dst->tab_, 0);
+    CHECK_EQ(dst->tab_idx_, 0);
+    dst->size_ = nclk_;
+    dst->tab_idx_ = ctx->clock_alloc.Alloc(c);
+    dst->tab_ = ctx->clock_alloc.Map(dst->tab_idx_);
+    internal_memset(dst->tab_, 0, sizeof(*dst->tab_));
+    return;
+  }
+  // Growing two-level table.
+  if (dst->size_ == 0) {
+    // Allocate first level table.
+    dst->tab_idx_ = ctx->clock_alloc.Alloc(c);
+    dst->tab_ = ctx->clock_alloc.Map(dst->tab_idx_);
+    internal_memset(dst->tab_, 0, sizeof(*dst->tab_));
+  } else if (dst->size_ <= ClockBlock::kClockCount) {
+    // Transform one-level table to two-level table.
+    u32 old = dst->tab_idx_;
+    dst->tab_idx_ = ctx->clock_alloc.Alloc(c);
+    dst->tab_ = ctx->clock_alloc.Map(dst->tab_idx_);
+    internal_memset(dst->tab_, 0, sizeof(*dst->tab_));
+    dst->tab_->table[0] = old;
+  }
+  // At this point we have first level table allocated.
+  // Add second level tables as necessary.
+  for (uptr i = RoundUpTo(dst->size_, ClockBlock::kClockCount);
+      i < nclk_; i += ClockBlock::kClockCount) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    internal_memset(cb, 0, sizeof(*cb));
+    CHECK_EQ(dst->tab_->table[i/ClockBlock::kClockCount], 0);
+    dst->tab_->table[i/ClockBlock::kClockCount] = idx;
+  }
+  dst->size_ = nclk_;
+}
+
 // Sets a single element in the vector clock.
 // This function is called only from weird places like AcquireGlobal.
 void ThreadClock::set(unsigned tid, u64 v) {
@@ -320,34 +368,59 @@ void ThreadClock::DebugDump(int(*printf)
       tid_, reused_, last_acquire_);
 }
 
-SyncClock::SyncClock()
-    : clk_(MBlockClock) {
+SyncClock::SyncClock() {
+  tab_ = 0;
+  tab_idx_ = 0;
+  size_ = 0;
   release_store_tid_ = kInvalidTid;
   release_store_reused_ = 0;
   for (uptr i = 0; i < kDirtyTids; i++)
     dirty_tids_[i] = kInvalidTid;
 }
 
-void SyncClock::Reset() {
-  clk_.Reset();
-  Zero();
-}
-
-void SyncClock::Zero() {
-  clk_.Resize(0);
+SyncClock::~SyncClock() {
+  CHECK_EQ(size_, 0);
+  CHECK_EQ(tab_, 0);
+  CHECK_EQ(tab_idx_, 0);
+}
+
+void SyncClock::Reset(ClockCache *c) {
+  if (size_ == 0) {
+    // nothing
+  } else if (size_ <= ClockBlock::kClockCount) {
+    // One-level table.
+    ctx->clock_alloc.Free(c, tab_idx_);
+  } else {
+    // Two-level table.
+    for (uptr i = 0; i < size_; i += ClockBlock::kClockCount)
+      ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]);
+    ctx->clock_alloc.Free(c, tab_idx_);
+  }
+  tab_ = 0;
+  tab_idx_ = 0;
+  size_ = 0;
   release_store_tid_ = kInvalidTid;
   release_store_reused_ = 0;
   for (uptr i = 0; i < kDirtyTids; i++)
     dirty_tids_[i] = kInvalidTid;
 }
 
+ClockElem &SyncClock::elem(unsigned tid) const {
+  DCHECK_LT(tid, size_);
+  if (size_ <= ClockBlock::kClockCount)
+    return tab_->clock[tid];
+  u32 idx = tab_->table[tid / ClockBlock::kClockCount];
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  return cb->clock[tid % ClockBlock::kClockCount];
+}
+
 void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
   printf("clock=[");
-  for (uptr i = 0; i < clk_.Size(); i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch);
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
   printf("] reused=[");
-  for (uptr i = 0; i < clk_.Size(); i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].reused);
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
   printf("] release_store_tid=%d/%d dirty_tids=%d/%d",
       release_store_tid_, release_store_reused_,
       dirty_tids_[0], dirty_tids_[1]);

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h Tue Aug  5 13:45:02 2014
@@ -14,7 +14,7 @@
 #define TSAN_CLOCK_H
 
 #include "tsan_defs.h"
-#include "tsan_vector.h"
+#include "tsan_dense_alloc.h"
 
 namespace __tsan {
 
@@ -23,37 +23,64 @@ struct ClockElem {
   u64 reused : 64 - kClkBits;
 };
 
+struct ClockBlock {
+  static const uptr kSize = 512;
+  static const uptr kTableSize = kSize / sizeof(u32);
+  static const uptr kClockCount = kSize / sizeof(ClockElem);
+
+  union {
+    u32       table[kTableSize];
+    ClockElem clock[kClockCount];
+  };
+
+  ClockBlock() {
+  }
+};
+
+typedef DenseSlabAlloc<ClockBlock, 1<<16, 1<<10> ClockAlloc;
+typedef DenseSlabAllocCache ClockCache;
+
 // The clock that lives in sync variables (mutexes, atomics, etc).
 class SyncClock {
  public:
   SyncClock();
+  ~SyncClock();
 
   uptr size() const {
-    return clk_.Size();
+    return size_;
   }
 
   u64 get(unsigned tid) const {
-    DCHECK_LT(tid, clk_.Size());
-    return clk_[tid].epoch;
+    return elem(tid).epoch;
   }
 
-  void Reset();
-  void Zero();
+  void Reset(ClockCache *c);
 
   void DebugDump(int(*printf)(const char *s, ...));
 
  private:
+  friend struct ThreadClock;
+  static const uptr kDirtyTids = 2;
+
   unsigned release_store_tid_;
   unsigned release_store_reused_;
-  static const uptr kDirtyTids = 2;
   unsigned dirty_tids_[kDirtyTids];
-  mutable Vector<ClockElem> clk_;
-  friend struct ThreadClock;
+  // tab_ contains indirect pointer to a 512b block using DenseSlabAlloc.
+  // If size_ <= 64, then tab_ points to an array with 64 ClockElem's.
+  // Otherwise, tab_ points to an array with 128 u32 elements,
+  // each pointing to the second-level 512b block with 64 ClockElem's.
+  ClockBlock *tab_;
+  u32 tab_idx_;
+  u32 size_;
+
+  ClockElem &elem(unsigned tid) const;
 };
 
 // The clock that lives in threads.
 struct ThreadClock {
  public:
+  typedef DenseSlabAllocCache Cache;
+
   explicit ThreadClock(unsigned tid, unsigned reused = 0);
 
   u64 get(unsigned tid) const {
@@ -76,10 +103,10 @@ struct ThreadClock {
     return nclk_;
   }
 
-  void acquire(const SyncClock *src);
-  void release(SyncClock *dst) const;
-  void acq_rel(SyncClock *dst);
-  void ReleaseStore(SyncClock *dst) const;
+  void acquire(ClockCache *c, const SyncClock *src);
+  void release(ClockCache *c, SyncClock *dst) const;
+  void acq_rel(ClockCache *c, SyncClock *dst);
+  void ReleaseStore(ClockCache *c, SyncClock *dst) const;
 
   void DebugReset();
   void DebugDump(int(*printf)(const char *s, ...));
@@ -94,6 +121,7 @@ struct ThreadClock {
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
   void UpdateCurrentThread(SyncClock *dst) const;
+  void Resize(ClockCache *c, SyncClock *dst) const;
 };
 
 }  // namespace __tsan

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_flags.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_flags.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_flags.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_flags.cc Tue Aug  5 13:45:02 2014
@@ -107,7 +107,7 @@ void InitializeFlags(Flags *f, const cha
   ParseCommonFlagsFromString(f, env);
 
   // Copy back to common flags.
-  *common_flags() = *f;
+  internal_memcpy(common_flags(), f, sizeof(*common_flags()));
 
   // Sanity check.
   if (!f->report_bugs) {

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.h?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.h (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.h Tue Aug  5 13:45:02 2014
@@ -374,6 +374,7 @@ struct ThreadState {
 
   DenseSlabAllocCache block_cache;
   DenseSlabAllocCache sync_cache;
+  DenseSlabAllocCache clock_cache;
 
 #ifndef TSAN_GO
   u32 last_sleep_stack_id;
@@ -418,6 +419,7 @@ class ThreadContext : public ThreadConte
   void OnStarted(void *arg);
   void OnCreated(void *arg);
   void OnReset();
+  void OnDetached(void *arg);
 };
 
 struct RacyStacks {
@@ -466,6 +468,8 @@ struct Context {
   InternalMmapVector<FiredSuppression> fired_suppressions;
   DDetector *dd;
 
+  ClockAlloc clock_alloc;
+
   Flags flags;
 
   u64 stat[StatCnt];

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc Tue Aug  5 13:45:02 2014
@@ -118,7 +118,7 @@ void MutexDestroy(ThreadState *thr, uptr
   u64 mid = s->GetId();
   u32 last_lock = s->last_lock;
   if (!unlock_locked)
-    s->Reset();  // must not reset it before the report is printed
+    s->Reset(thr);  // must not reset it before the report is printed
   s->mtx.Unlock();
   if (unlock_locked) {
     ThreadRegistryLock l(ctx->thread_registry);
@@ -136,7 +136,7 @@ void MutexDestroy(ThreadState *thr, uptr
   if (unlock_locked) {
     SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
     if (s != 0) {
-      s->Reset();
+      s->Reset(thr);
       s->mtx.Unlock();
     }
   }
@@ -429,7 +429,7 @@ void AcquireImpl(ThreadState *thr, uptr
   if (thr->ignore_sync)
     return;
   thr->clock.set(thr->fast_state.epoch());
-  thr->clock.acquire(c);
+  thr->clock.acquire(&thr->clock_cache, c);
   StatInc(thr, StatSyncAcquire);
 }
 
@@ -438,7 +438,7 @@ void ReleaseImpl(ThreadState *thr, uptr
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.release(c);
+  thr->clock.release(&thr->clock_cache, c);
   StatInc(thr, StatSyncRelease);
 }
 
@@ -447,7 +447,7 @@ void ReleaseStoreImpl(ThreadState *thr,
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.ReleaseStore(c);
+  thr->clock.ReleaseStore(&thr->clock_cache, c);
   StatInc(thr, StatSyncRelease);
 }
 
@@ -456,7 +456,7 @@ void AcquireReleaseImpl(ThreadState *thr
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.acq_rel(c);
+  thr->clock.acq_rel(&thr->clock_cache, c);
   StatInc(thr, StatSyncAcquire);
   StatInc(thr, StatSyncRelease);
 }

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_thread.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_thread.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_thread.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_thread.cc Tue Aug  5 13:45:02 2014
@@ -36,13 +36,13 @@ ThreadContext::~ThreadContext() {
 #endif
 
 void ThreadContext::OnDead() {
-  sync.Reset();
+  CHECK_EQ(sync.size(), 0);
 }
 
 void ThreadContext::OnJoined(void *arg) {
   ThreadState *caller_thr = static_cast<ThreadState *>(arg);
   AcquireImpl(caller_thr, 0, &sync);
-  sync.Reset();
+  sync.Reset(&caller_thr->clock_cache);
 }
 
 struct OnCreatedArgs {
@@ -65,11 +65,16 @@ void ThreadContext::OnCreated(void *arg)
 }
 
 void ThreadContext::OnReset() {
-  sync.Reset();
+  CHECK_EQ(sync.size(), 0);
   FlushUnneededShadowMemory(GetThreadTrace(tid), TraceSize() * sizeof(Event));
   //!!! FlushUnneededShadowMemory(GetThreadTraceHeader(tid), sizeof(Trace));
 }
 
+void ThreadContext::OnDetached(void *arg) {
+  ThreadState *thr1 = static_cast<ThreadState*>(arg);
+  sync.Reset(&thr1->clock_cache);
+}
+
 struct OnStartedArgs {
   ThreadState *thr;
   uptr stk_addr;
@@ -113,7 +118,7 @@ void ThreadContext::OnStarted(void *arg)
   Trace *thr_trace = ThreadTrace(thr->tid);
   thr_trace->headers[trace].epoch0 = epoch0;
   StatInc(thr, StatSyncAcquire);
-  sync.Reset();
+  sync.Reset(&thr->clock_cache);
   DPrintf("#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx "
           "tls_addr=%zx tls_size=%zx\n",
           tid, (uptr)epoch0, args->stk_addr, args->stk_size,
@@ -134,6 +139,7 @@ void ThreadContext::OnFinished() {
     ctx->dd->DestroyPhysicalThread(thr->dd_pt);
     ctx->dd->DestroyLogicalThread(thr->dd_lt);
   }
+  ctx->clock_alloc.FlushCache(&thr->clock_cache);
   ctx->metamap.OnThreadIdle(thr);
 #ifndef TSAN_GO
   AllocatorThreadFinish(thr);
@@ -307,7 +313,7 @@ void ThreadJoin(ThreadState *thr, uptr p
 void ThreadDetach(ThreadState *thr, uptr pc, int tid) {
   CHECK_GT(tid, 0);
   CHECK_LT(tid, kMaxTid);
-  ctx->thread_registry->DetachThread(tid);
+  ctx->thread_registry->DetachThread(tid, thr);
 }
 
 void ThreadSetName(ThreadState *thr, const char *name) {

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_sync.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_sync.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_sync.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_sync.cc Tue Aug  5 13:45:02 2014
@@ -21,7 +21,7 @@ void DDMutexInit(ThreadState *thr, uptr
 
 SyncVar::SyncVar()
     : mtx(MutexTypeSyncVar, StatMtxSyncVar) {
-  Reset();
+  Reset(0);
 }
 
 void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid) {
@@ -36,7 +36,7 @@ void SyncVar::Init(ThreadState *thr, upt
     DDMutexInit(thr, pc, this);
 }
 
-void SyncVar::Reset() {
+void SyncVar::Reset(ThreadState *thr) {
   uid = 0;
   creation_stack_id = 0;
   owner_tid = kInvalidTid;
@@ -47,8 +47,13 @@ void SyncVar::Reset() {
   is_broken = 0;
   is_linker_init = 0;
 
-  clock.Zero();
-  read_clock.Reset();
+  if (thr == 0) {
+    CHECK_EQ(clock.size(), 0);
+    CHECK_EQ(read_clock.size(), 0);
+  } else {
+    clock.Reset(&thr->clock_cache);
+    read_clock.Reset(&thr->clock_cache);
+  }
 }
 
 MetaMap::MetaMap() {
@@ -93,7 +98,7 @@ void MetaMap::FreeRange(ThreadState *thr
         DCHECK(idx & kFlagSync);
         SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask);
         u32 next = s->next;
-        s->Reset();
+        s->Reset(thr);
         sync_alloc_.Free(&thr->sync_cache, idx & ~kFlagMask);
         idx = next;
       } else {
@@ -143,7 +148,7 @@ SyncVar* MetaMap::GetAndLock(ThreadState
       SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask);
       if (s->addr == addr) {
         if (myidx != 0) {
-          mys->Reset();
+          mys->Reset(thr);
           sync_alloc_.Free(&thr->sync_cache, myidx);
         }
         if (write_lock)

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_sync.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_sync.h?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_sync.h (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_sync.h Tue Aug  5 13:45:02 2014
@@ -47,7 +47,7 @@ struct SyncVar {
   SyncClock clock;
 
   void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid);
-  void Reset();
+  void Reset(ThreadState *thr);
 
   u64 GetId() const {
     // 47 lsb is addr, then 14 bits is low part of uid, then 3 zero bits.

Modified: compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc (original)
+++ compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc Tue Aug  5 13:45:02 2014
@@ -17,6 +17,8 @@
 
 namespace __tsan {
 
+ClockCache cache;
+
 TEST(Clock, VectorBasic) {
   ThreadClock clk(0);
   ASSERT_EQ(clk.size(), 1U);
@@ -38,30 +40,32 @@ TEST(Clock, ChunkedBasic) {
   SyncClock chunked;
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 0U);
-  vector.acquire(&chunked);
+  vector.acquire(&cache, &chunked);
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 0U);
-  vector.release(&chunked);
+  vector.release(&cache, &chunked);
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 1U);
-  vector.acq_rel(&chunked);
+  vector.acq_rel(&cache, &chunked);
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 1U);
+  chunked.Reset(&cache);
 }
 
 TEST(Clock, AcquireRelease) {
   ThreadClock vector1(100);
   vector1.tick();
   SyncClock chunked;
-  vector1.release(&chunked);
+  vector1.release(&cache, &chunked);
   ASSERT_EQ(chunked.size(), 101U);
   ThreadClock vector2(0);
-  vector2.acquire(&chunked);
+  vector2.acquire(&cache, &chunked);
   ASSERT_EQ(vector2.size(), 101U);
   ASSERT_EQ(vector2.get(0), 0U);
   ASSERT_EQ(vector2.get(1), 0U);
   ASSERT_EQ(vector2.get(99), 0U);
   ASSERT_EQ(vector2.get(100), 1U);
+  chunked.Reset(&cache);
 }
 
 TEST(Clock, RepeatedAcquire) {
@@ -71,10 +75,12 @@ TEST(Clock, RepeatedAcquire) {
   thr2.tick();
 
   SyncClock sync;
-  thr1.ReleaseStore(&sync);
+  thr1.ReleaseStore(&cache, &sync);
+
+  thr2.acquire(&cache, &sync);
+  thr2.acquire(&cache, &sync);
 
-  thr2.acquire(&sync);
-  thr2.acquire(&sync);
+  sync.Reset(&cache);
 }
 
 TEST(Clock, ManyThreads) {
@@ -83,9 +89,9 @@ TEST(Clock, ManyThreads) {
     ThreadClock vector(0);
     vector.tick();
     vector.set(i, 1);
-    vector.release(&chunked);
+    vector.release(&cache, &chunked);
     ASSERT_EQ(i + 1, chunked.size());
-    vector.acquire(&chunked);
+    vector.acquire(&cache, &chunked);
     ASSERT_EQ(i + 1, vector.size());
   }
 
@@ -93,10 +99,12 @@ TEST(Clock, ManyThreads) {
     ASSERT_EQ(1U, chunked.get(i));
 
   ThreadClock vector(1);
-  vector.acquire(&chunked);
+  vector.acquire(&cache, &chunked);
   ASSERT_EQ(100U, vector.size());
   for (unsigned i = 0; i < 100; i++)
     ASSERT_EQ(1U, vector.get(i));
+
+  chunked.Reset(&cache);
 }
 
 TEST(Clock, DifferentSizes) {
@@ -107,33 +115,102 @@ TEST(Clock, DifferentSizes) {
     vector2.tick();
     {
       SyncClock chunked;
-      vector1.release(&chunked);
+      vector1.release(&cache, &chunked);
       ASSERT_EQ(chunked.size(), 11U);
-      vector2.release(&chunked);
+      vector2.release(&cache, &chunked);
       ASSERT_EQ(chunked.size(), 21U);
+      chunked.Reset(&cache);
     }
     {
       SyncClock chunked;
-      vector2.release(&chunked);
+      vector2.release(&cache, &chunked);
       ASSERT_EQ(chunked.size(), 21U);
-      vector1.release(&chunked);
+      vector1.release(&cache, &chunked);
       ASSERT_EQ(chunked.size(), 21U);
+      chunked.Reset(&cache);
     }
     {
       SyncClock chunked;
-      vector1.release(&chunked);
-      vector2.acquire(&chunked);
+      vector1.release(&cache, &chunked);
+      vector2.acquire(&cache, &chunked);
       ASSERT_EQ(vector2.size(), 21U);
+      chunked.Reset(&cache);
     }
     {
       SyncClock chunked;
-      vector2.release(&chunked);
-      vector1.acquire(&chunked);
+      vector2.release(&cache, &chunked);
+      vector1.acquire(&cache, &chunked);
       ASSERT_EQ(vector1.size(), 21U);
+      chunked.Reset(&cache);
     }
   }
 }
 
+TEST(Clock, Growth) {
+  {
+    ThreadClock vector(10);
+    vector.tick();
+    vector.set(5, 42);
+    SyncClock sync;
+    vector.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 11U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(1), 0ULL);
+    ASSERT_EQ(sync.get(5), 42ULL);
+    ASSERT_EQ(sync.get(9), 0ULL);
+    ASSERT_EQ(sync.get(10), 1ULL);
+    sync.Reset(&cache);
+  }
+  {
+    ThreadClock vector1(10);
+    vector1.tick();
+    ThreadClock vector2(20);
+    vector2.tick();
+    SyncClock sync;
+    vector1.release(&cache, &sync);
+    vector2.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 21U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(10), 1ULL);
+    ASSERT_EQ(sync.get(19), 0ULL);
+    ASSERT_EQ(sync.get(20), 1ULL);
+    sync.Reset(&cache);
+  }
+  {
+    ThreadClock vector(100);
+    vector.tick();
+    vector.set(5, 42);
+    vector.set(90, 84);
+    SyncClock sync;
+    vector.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 101U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(1), 0ULL);
+    ASSERT_EQ(sync.get(5), 42ULL);
+    ASSERT_EQ(sync.get(60), 0ULL);
+    ASSERT_EQ(sync.get(70), 0ULL);
+    ASSERT_EQ(sync.get(90), 84ULL);
+    ASSERT_EQ(sync.get(99), 0ULL);
+    ASSERT_EQ(sync.get(100), 1ULL);
+    sync.Reset(&cache);
+  }
+  {
+    ThreadClock vector1(10);
+    vector1.tick();
+    ThreadClock vector2(100);
+    vector2.tick();
+    SyncClock sync;
+    vector1.release(&cache, &sync);
+    vector2.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 101U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(10), 1ULL);
+    ASSERT_EQ(sync.get(99), 0ULL);
+    ASSERT_EQ(sync.get(100), 1ULL);
+    sync.Reset(&cache);
+  }
+}
+
 const int kThreads = 4;
 const int kClocks = 4;
 
@@ -257,31 +334,31 @@ static bool ClockFuzzer(bool printing) {
       if (printing)
         printf("acquire thr%d <- clk%d\n", tid, cid);
       thr0[tid]->acquire(sync0[cid]);
-      thr1[tid]->acquire(sync1[cid]);
+      thr1[tid]->acquire(&cache, sync1[cid]);
       break;
     case 1:
       if (printing)
         printf("release thr%d -> clk%d\n", tid, cid);
       thr0[tid]->release(sync0[cid]);
-      thr1[tid]->release(sync1[cid]);
+      thr1[tid]->release(&cache, sync1[cid]);
       break;
     case 2:
       if (printing)
         printf("acq_rel thr%d <> clk%d\n", tid, cid);
       thr0[tid]->acq_rel(sync0[cid]);
-      thr1[tid]->acq_rel(sync1[cid]);
+      thr1[tid]->acq_rel(&cache, sync1[cid]);
       break;
     case 3:
       if (printing)
         printf("rel_str thr%d >> clk%d\n", tid, cid);
       thr0[tid]->ReleaseStore(sync0[cid]);
-      thr1[tid]->ReleaseStore(sync1[cid]);
+      thr1[tid]->ReleaseStore(&cache, sync1[cid]);
       break;
     case 4:
       if (printing)
         printf("reset clk%d\n", cid);
       sync0[cid]->Reset();
-      sync1[cid]->Reset();
+      sync1[cid]->Reset(&cache);
       break;
     case 5:
       if (printing)
@@ -331,6 +408,10 @@ static bool ClockFuzzer(bool printing) {
       return false;
     }
   }
+
+  for (unsigned i = 0; i < kClocks; i++) {
+    sync1[i]->Reset(&cache);
+  }
   return true;
 }
 

Modified: compiler-rt/trunk/lib/tsan/tests/unit/tsan_sync_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/tests/unit/tsan_sync_test.cc?rev=214912&r1=214911&r2=214912&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/tests/unit/tsan_sync_test.cc (original)
+++ compiler-rt/trunk/lib/tsan/tests/unit/tsan_sync_test.cc Tue Aug  5 13:45:02 2014
@@ -114,7 +114,7 @@ TEST(MetaMap, ResetSync) {
   u64 block[1] = {};  // fake malloc block
   m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
   SyncVar *s = m->GetOrCreateAndLock(thr, 0, (uptr)&block[0], true);
-  s->Reset();
+  s->Reset(thr);
   s->mtx.Unlock();
   uptr sz = m->FreeBlock(thr, 0, (uptr)&block[0]);
   EXPECT_EQ(sz, 1 * sizeof(u64));

Added: compiler-rt/trunk/test/tsan/thread_detach.c
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/test/tsan/thread_detach.c?rev=214912&view=auto
==============================================================================
--- compiler-rt/trunk/test/tsan/thread_detach.c (added)
+++ compiler-rt/trunk/test/tsan/thread_detach.c Tue Aug  5 13:45:02 2014
@@ -0,0 +1,20 @@
+// RUN: %clang_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+
+void *Thread(void *x) {  // Thread start routine; ignores x.
+  return 0;  // Returns immediately without doing any work.
+}
+
+int main() {  // Creates one thread, detaches it after a delay, prints PASS.
+  pthread_t t;
+  pthread_create(&t, 0, Thread, 0);  // Return value is not checked.
+  sleep(1);  // NOTE(review): presumably lets Thread finish first; confirm.
+  pthread_detach(t);  // Detached, never joined; no leak report is expected.
+  printf("PASS\n");
+  return 0;
+}
+
+// CHECK-NOT: WARNING: ThreadSanitizer: thread leak
+// CHECK: PASS