[compiler-rt] r308018 - tsan: optimize sync clock memory consumption

Dmitry Vyukov via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 14 04:30:06 PDT 2017


Author: dvyukov
Date: Fri Jul 14 04:30:06 2017
New Revision: 308018

URL: http://llvm.org/viewvc/llvm-project?rev=308018&view=rev
Log:
tsan: optimize sync clock memory consumption

This change implements two optimizations of sync clocks that reduce memory consumption:

1. Use previously unused first level block space to store clock elements.
Currently a clock for 100 threads consumes 3 512-byte blocks:

  2 second level blocks (of 64-bit clock elements) to store the clock elements
  + 1 first level block (of 32-bit indices) pointing to the second level blocks

Only 8 bytes of the first level block are actually used.
With this change such a clock consumes only 2 blocks.
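To make the arithmetic concrete, here is a sketch of the new capacity
computation (modeled on SyncClock::capacity() from this patch; RoundUpTo is
the sanitizer_common helper, and the free function below is illustrative,
not part of the patch):

  // How many clock elements fit into a clock with `blocks` second level
  // blocks. The first level block stores one 32-bit index per second level
  // block plus a 32-bit reference counter; the rest of it holds 64-bit
  // clock elements.
  uptr capacity(uptr blocks) {
    uptr ratio = sizeof(u64) / sizeof(u32);  // clock elem / table slot = 2
    uptr top = 64 - RoundUpTo(blocks + 1, ratio) / ratio;
    return blocks * 64 + top;  // 64 == ClockBlock::kClockCount
  }

  // capacity(1) == 64 + 63 == 127, so a clock for 100 threads now fits in
  // 1 first level + 1 second level block = 2 blocks instead of 3.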

2. Share similar clocks that differ only by a single clock entry for the current thread.
When a thread does several release operations on fresh sync objects without
intervening acquire operations (e.g. initialization of several fields in a ctor),
the resulting clocks differ only by a single entry for the current thread.
This change reuses a single clock for such release operations; the current thread's
time (which does differ between the clocks) is stored in dirty entries.
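The fast path in ThreadClock::ReleaseStore is the heart of this; below is a
simplified excerpt of the code in this patch (checks, stats and the
release_store_ bookkeeping are omitted):

  // Release-store to an empty sync object: reuse the cached shared clock
  // and record only the current thread's time in a dirty entry.
  if (dst->size_ == 0 && cached_idx_ != 0) {
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);  // shared, immutable table
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    dst->dirty_[0].tid = tid_;          // the single differing entry lives here
    dst->dirty_[0].epoch = clk_[tid_];
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);  // grab a ref
    return;
  }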

We are experiencing issues with a large program that eats all 64M clock blocks
(32GB of non-flushable memory) and crashes with a dense allocator overflow.
The max number of threads in the program is ~170, which is quite unfortunate
under the current scheme (such clocks consume 4 blocks each); the program
crashes after consuming 60+ GB of memory. The first optimization brings clock
block consumption down to ~40M and allows the program to work. The second
optimization further reduces block consumption to a "modest" 16M blocks
(~8GB of RAM) and reduces overall RAM consumption to ~30GB.

Measurements on another real-world C++ RPC benchmark show an RSS reduction
from 3.491GB to 3.186GB and a modest speedup of ~5%.

A Go parallel client/server HTTP benchmark:
https://github.com/golang/benchmarks/blob/master/http/http.go
shows an RSS reduction from 320MB to 240MB and a few percent speedup.

Reviewed in https://reviews.llvm.org/D35323


Modified:
    compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h
    compiler-rt/trunk/lib/tsan/rtl/tsan_defs.h
    compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc?rev=308018&r1=308017&r2=308018&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc Fri Jul 14 04:30:06 2017
@@ -61,20 +61,13 @@
 // an exclusive lock; ThreadClock's are private to respective threads and so
 // do not need any protection.
 //
-// Description of ThreadClock state:
-// clk_ - fixed size vector clock.
-// nclk_ - effective size of the vector clock (the rest is zeros).
-// tid_ - index of the thread associated with he clock ("current thread").
-// last_acquire_ - current thread time when it acquired something from
-//   other threads.
-//
 // Description of SyncClock state:
 // clk_ - variable size vector clock, low kClkBits hold timestamp,
 //   the remaining bits hold "acquired" flag (the actual value is thread's
 //   reused counter);
 //   if acquired == thr->reused_, then the respective thread has already
-//   acquired this clock (except possibly dirty_tids_).
-// dirty_tids_ - holds up to two indeces in the vector clock that other threads
+//   acquired this clock (except possibly for dirty elements).
+// dirty_ - holds up to two indices in the vector clock that other threads
 //   need to acquire regardless of "acquired" flag value;
 // release_store_tid_ - denotes that the clock state is a result of
 //   release-store operation by the thread with release_store_tid_ index.
@@ -90,21 +83,51 @@
 
 namespace __tsan {
 
+static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
+  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
+}
+
+// Drop reference to the first level block idx.
+static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  atomic_uint32_t *ref = ref_ptr(cb);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  for (;;) {
+    CHECK_GT(v, 0);
+    if (v == 1)
+      break;
+    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
+      return;
+  }
+  // The first level block owns the second level blocks, so free them as well.
+  for (uptr i = 0; i < blocks; i++)
+    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
+  ctx->clock_alloc.Free(c, idx);
+}
+
 ThreadClock::ThreadClock(unsigned tid, unsigned reused)
     : tid_(tid)
-    , reused_(reused + 1) {  // 0 has special meaning
+    , reused_(reused + 1)  // 0 has special meaning
+    , cached_idx_()
+    , cached_size_()
+    , cached_blocks_() {
   CHECK_LT(tid, kMaxTidInClock);
   CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
   nclk_ = tid_ + 1;
   last_acquire_ = 0;
   internal_memset(clk_, 0, sizeof(clk_));
-  clk_[tid_].reused = reused_;
 }
 
 void ThreadClock::ResetCached(ClockCache *c) {
+  if (cached_idx_) {
+    UnrefClockBlock(c, cached_idx_, cached_blocks_);
+    cached_idx_ = 0;
+    cached_size_ = 0;
+    cached_blocks_ = 0;
+  }
 }
 
-void ThreadClock::acquire(ClockCache *c, const SyncClock *src) {
+void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(src->size_, kMaxTid);
   CPP_STAT_INC(StatClockAcquire);
@@ -116,50 +139,46 @@ void ThreadClock::acquire(ClockCache *c,
     return;
   }
 
-  // Check if we've already acquired src after the last release operation on src
   bool acquired = false;
-  if (nclk > tid_) {
-    if (src->elem(tid_).reused == reused_) {
-      for (unsigned i = 0; i < kDirtyTids; i++) {
-        unsigned tid = src->dirty_tids_[i];
-        if (tid != kInvalidTid) {
-          u64 epoch = src->elem(tid).epoch;
-          if (clk_[tid].epoch < epoch) {
-            clk_[tid].epoch = epoch;
-            acquired = true;
-          }
-        }
-      }
-      if (acquired) {
-        CPP_STAT_INC(StatClockAcquiredSomething);
-        last_acquire_ = clk_[tid_].epoch;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty dirty = src->dirty_[i];
+    unsigned tid = dirty.tid;
+    if (tid != kInvalidTid) {
+      if (clk_[tid] < dirty.epoch) {
+        clk_[tid] = dirty.epoch;
+        acquired = true;
       }
-      return;
     }
   }
 
-  // O(N) acquire.
-  CPP_STAT_INC(StatClockAcquireFull);
-  nclk_ = max(nclk_, nclk);
-  for (uptr i = 0; i < nclk; i++) {
-    u64 epoch = src->elem(i).epoch;
-    if (clk_[i].epoch < epoch) {
-      clk_[i].epoch = epoch;
-      acquired = true;
+  // Check if we've already acquired src after the last release operation on src
+  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
+    // O(N) acquire.
+    CPP_STAT_INC(StatClockAcquireFull);
+    nclk_ = max(nclk_, nclk);
+    u64 *dst_pos = &clk_[0];
+    for (ClockElem &src_elem : *src) {
+      u64 epoch = src_elem.epoch;
+      if (*dst_pos < epoch) {
+        *dst_pos = epoch;
+        acquired = true;
+      }
+      dst_pos++;
     }
-  }
 
-  // Remember that this thread has acquired this clock.
-  if (nclk > tid_)
-    src->elem(tid_).reused = reused_;
+    // Remember that this thread has acquired this clock.
+    if (nclk > tid_)
+      src->elem(tid_).reused = reused_;
+  }
 
   if (acquired) {
     CPP_STAT_INC(StatClockAcquiredSomething);
-    last_acquire_ = clk_[tid_].epoch;
+    last_acquire_ = clk_[tid_];
+    ResetCached(c);
   }
 }
 
-void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
+void ThreadClock::release(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
 
@@ -179,7 +198,7 @@ void ThreadClock::release(ClockCache *c,
   // since the last release on dst. If so, we need to update
   // only dst->elem(tid_).
   if (dst->elem(tid_).epoch > last_acquire_) {
-    UpdateCurrentThread(dst);
+    UpdateCurrentThread(c, dst);
     if (dst->release_store_tid_ != tid_ ||
         dst->release_store_reused_ != reused_)
       dst->release_store_tid_ = kInvalidTid;
@@ -188,23 +207,24 @@ void ThreadClock::release(ClockCache *c,
 
   // O(N) release.
   CPP_STAT_INC(StatClockReleaseFull);
+  dst->Unshare(c);
   // First, remember whether we've acquired dst.
   bool acquired = IsAlreadyAcquired(dst);
   if (acquired)
     CPP_STAT_INC(StatClockReleaseAcquired);
   // Update dst->clk_.
-  for (uptr i = 0; i < nclk_; i++) {
-    ClockElem &ce = dst->elem(i);
-    ce.epoch = max(ce.epoch, clk_[i].epoch);
+  dst->FlushDirty();
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = max(ce.epoch, clk_[i]);
     ce.reused = 0;
+    i++;
   }
   // Clear 'acquired' flag in the remaining elements.
   if (nclk_ < dst->size_)
     CPP_STAT_INC(StatClockReleaseClearTail);
   for (uptr i = nclk_; i < dst->size_; i++)
     dst->elem(i).reused = 0;
-  for (unsigned i = 0; i < kDirtyTids; i++)
-    dst->dirty_tids_[i] = kInvalidTid;
   dst->release_store_tid_ = kInvalidTid;
   dst->release_store_reused_ = 0;
   // If we've acquired dst, remember this fact,
@@ -213,11 +233,37 @@ void ThreadClock::release(ClockCache *c,
     dst->elem(tid_).reused = reused_;
 }
 
-void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const {
+void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
   CPP_STAT_INC(StatClockStore);
 
+  if (dst->size_ == 0 && cached_idx_ != 0) {
+    // Reuse the cached clock.
+    // Note: we could reuse/cache the cached clock in more cases:
+    // we could update the existing clock and cache it, or replace it with the
+    // currently cached clock and release the old one. And for a shared
+    // existing clock, we could replace it with the currently cached;
+// or unshare, update and cache. But, for simplicity, we currently reuse
+    // cached clock only when the target clock is empty.
+    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
+    dst->tab_idx_ = cached_idx_;
+    dst->size_ = cached_size_;
+    dst->blocks_ = cached_blocks_;
+    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
+    // The cached clock is shared (immutable),
+    // so this is where we store the current clock.
+    dst->dirty_[0].tid = tid_;
+    dst->dirty_[0].epoch = clk_[tid_];
+    dst->release_store_tid_ = tid_;
+    dst->release_store_reused_ = reused_;
+    // Remember that we don't need to acquire it in the future.
+    dst->elem(tid_).reused = reused_;
+    // Grab a reference.
+    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    return;
+  }
+
   // Check if we need to resize dst.
   if (dst->size_ < nclk_)
     dst->Resize(c, nclk_);
@@ -226,32 +272,41 @@ void ThreadClock::ReleaseStore(ClockCach
       dst->release_store_reused_ == reused_ &&
       dst->elem(tid_).epoch > last_acquire_) {
     CPP_STAT_INC(StatClockStoreFast);
-    UpdateCurrentThread(dst);
+    UpdateCurrentThread(c, dst);
     return;
   }
 
   // O(N) release-store.
   CPP_STAT_INC(StatClockStoreFull);
-  for (uptr i = 0; i < nclk_; i++) {
-    ClockElem &ce = dst->elem(i);
-    ce.epoch = clk_[i].epoch;
+  dst->Unshare(c);
+  // Note: dst can be larger than this ThreadClock.
+  // This is fine since clk_ beyond size is all zeros.
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = clk_[i];
     ce.reused = 0;
+    i++;
   }
-  // Clear the tail of dst->clk_.
-  if (nclk_ < dst->size_) {
-    for (uptr i = nclk_; i < dst->size_; i++) {
-      ClockElem &ce = dst->elem(i);
-      ce.epoch = 0;
-      ce.reused = 0;
-    }
-    CPP_STAT_INC(StatClockStoreTail);
-  }
-  for (unsigned i = 0; i < kDirtyTids; i++)
-    dst->dirty_tids_[i] = kInvalidTid;
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dst->dirty_[i].tid = kInvalidTid;
   dst->release_store_tid_ = tid_;
   dst->release_store_reused_ = reused_;
   // Remember that we don't need to acquire it in the future.
   dst->elem(tid_).reused = reused_;
+
+  // If the resulting clock is cachable, cache it for future release operations.
+  // The clock is always cachable if we released to an empty sync object.
+  if (cached_idx_ == 0 && dst->Cachable()) {
+    // Grab a reference to the ClockBlock.
+    atomic_uint32_t *ref = ref_ptr(dst->tab_);
+    if (atomic_load(ref, memory_order_acquire) == 1)
+      atomic_store_relaxed(ref, 2);
+    else
+      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    cached_idx_ = dst->tab_idx_;
+    cached_size_ = dst->size_;
+    cached_blocks_ = dst->blocks_;
+  }
 }
 
 void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
@@ -261,37 +316,36 @@ void ThreadClock::acq_rel(ClockCache *c,
 }
 
 // Updates only single element related to the current thread in dst->clk_.
-void ThreadClock::UpdateCurrentThread(SyncClock *dst) const {
+void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
   // Update the thread's time, but preserve the 'acquired' flag.
-  dst->elem(tid_).epoch = clk_[tid_].epoch;
-
   for (unsigned i = 0; i < kDirtyTids; i++) {
-    if (dst->dirty_tids_[i] == tid_) {
+    SyncClock::Dirty *dirty = &dst->dirty_[i];
+    const unsigned tid = dirty->tid;
+    if (tid == tid_ || tid == kInvalidTid) {
       CPP_STAT_INC(StatClockReleaseFast);
-      return;
-    }
-    if (dst->dirty_tids_[i] == kInvalidTid) {
-      CPP_STAT_INC(StatClockReleaseFast);
-      dst->dirty_tids_[i] = tid_;
+      dirty->tid = tid_;
+      dirty->epoch = clk_[tid_];
       return;
     }
   }
   // Reset all 'acquired' flags, O(N).
+  // We are going to touch dst elements, so we need to unshare it.
+  dst->Unshare(c);
   CPP_STAT_INC(StatClockReleaseSlow);
+  dst->elem(tid_).epoch = clk_[tid_];
   for (uptr i = 0; i < dst->size_; i++)
     dst->elem(i).reused = 0;
-  for (unsigned i = 0; i < kDirtyTids; i++)
-    dst->dirty_tids_[i] = kInvalidTid;
+  dst->FlushDirty();
 }
 
-// Checks whether the current threads has already acquired src.
+// Checks whether the current thread has already acquired src.
 bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
   if (src->elem(tid_).reused != reused_)
     return false;
   for (unsigned i = 0; i < kDirtyTids; i++) {
-    unsigned tid = src->dirty_tids_[i];
-    if (tid != kInvalidTid) {
-      if (clk_[tid].epoch < src->elem(tid).epoch)
+    SyncClock::Dirty dirty = src->dirty_[i];
+    if (dirty.tid != kInvalidTid) {
+      if (clk_[dirty.tid] < dirty.epoch)
         return false;
     }
   }
@@ -302,22 +356,19 @@ bool ThreadClock::IsAlreadyAcquired(cons
 // This function is called only from weird places like AcquireGlobal.
 void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
   DCHECK_LT(tid, kMaxTid);
-  DCHECK_GE(v, clk_[tid].epoch);
-  clk_[tid].epoch = v;
+  DCHECK_GE(v, clk_[tid]);
+  clk_[tid] = v;
   if (nclk_ <= tid)
     nclk_ = tid + 1;
-  last_acquire_ = clk_[tid_].epoch;
+  last_acquire_ = clk_[tid_];
+  ResetCached(c);
 }
 
 void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
   printf("clock=[");
   for (uptr i = 0; i < nclk_; i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch);
-  printf("] reused=[");
-  for (uptr i = 0; i < nclk_; i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].reused);
-  printf("] tid=%u/%u last_acq=%llu",
-      tid_, reused_, last_acquire_);
+    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
+  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
 }
 
 SyncClock::SyncClock() {
@@ -327,22 +378,14 @@ SyncClock::SyncClock() {
 SyncClock::~SyncClock() {
   // Reset must be called before dtor.
   CHECK_EQ(size_, 0);
+  CHECK_EQ(blocks_, 0);
   CHECK_EQ(tab_, 0);
   CHECK_EQ(tab_idx_, 0);
 }
 
 void SyncClock::Reset(ClockCache *c) {
-  if (size_ == 0) {
-    // nothing
-  } else if (size_ <= ClockBlock::kClockCount) {
-    // One-level table.
-    ctx->clock_alloc.Free(c, tab_idx_);
-  } else {
-    // Two-level table.
-    for (uptr i = 0; i < size_; i += ClockBlock::kClockCount)
-      ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]);
-    ctx->clock_alloc.Free(c, tab_idx_);
-  }
+  if (size_)
+    UnrefClockBlock(c, tab_idx_, blocks_);
   ResetImpl();
 }
 
@@ -350,66 +393,171 @@ void SyncClock::ResetImpl() {
   tab_ = 0;
   tab_idx_ = 0;
   size_ = 0;
+  blocks_ = 0;
   release_store_tid_ = kInvalidTid;
   release_store_reused_ = 0;
   for (uptr i = 0; i < kDirtyTids; i++)
-    dirty_tids_[i] = kInvalidTid;
+    dirty_[i].tid = kInvalidTid;
 }
 
 void SyncClock::Resize(ClockCache *c, uptr nclk) {
   CPP_STAT_INC(StatClockReleaseResize);
-  if (RoundUpTo(nclk, ClockBlock::kClockCount) <=
-      RoundUpTo(size_, ClockBlock::kClockCount)) {
-    // Growing within the same block.
+  Unshare(c);
+  if (nclk <= capacity()) {
     // Memory is already allocated, just increase the size.
     size_ = nclk;
     return;
   }
-  if (nclk <= ClockBlock::kClockCount) {
+  if (size_ == 0) {
     // Grow from 0 to one-level table.
     CHECK_EQ(size_, 0);
+    CHECK_EQ(blocks_, 0);
     CHECK_EQ(tab_, 0);
     CHECK_EQ(tab_idx_, 0);
-    size_ = nclk;
-    tab_idx_ = ctx->clock_alloc.Alloc(c);
-    tab_ = ctx->clock_alloc.Map(tab_idx_);
-    internal_memset(tab_, 0, sizeof(*tab_));
-    return;
-  }
-  // Growing two-level table.
-  if (size_ == 0) {
-    // Allocate first level table.
-    tab_idx_ = ctx->clock_alloc.Alloc(c);
-    tab_ = ctx->clock_alloc.Map(tab_idx_);
-    internal_memset(tab_, 0, sizeof(*tab_));
-  } else if (size_ <= ClockBlock::kClockCount) {
-    // Transform one-level table to two-level table.
-    u32 old = tab_idx_;
     tab_idx_ = ctx->clock_alloc.Alloc(c);
     tab_ = ctx->clock_alloc.Map(tab_idx_);
     internal_memset(tab_, 0, sizeof(*tab_));
-    tab_->table[0] = old;
+    atomic_store_relaxed(ref_ptr(tab_), 1);
+    size_ = 1;
+  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
+    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
+    CHECK_LT(top, ClockBlock::kClockCount);
+    const uptr move = top * sizeof(tab_->clock[0]);
+    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
+    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
+    internal_memset(tab_->clock, 0, move);
+    append_block(idx);
   }
-  // At this point we have first level table allocated.
+  // At this point we have first level table allocated and all clock elements
+  // are evacuated from it to a second level block.
   // Add second level tables as necessary.
-  for (uptr i = RoundUpTo(size_, ClockBlock::kClockCount);
-      i < nclk; i += ClockBlock::kClockCount) {
+  while (nclk > capacity()) {
     u32 idx = ctx->clock_alloc.Alloc(c);
     ClockBlock *cb = ctx->clock_alloc.Map(idx);
     internal_memset(cb, 0, sizeof(*cb));
-    CHECK_EQ(tab_->table[i/ClockBlock::kClockCount], 0);
-    tab_->table[i/ClockBlock::kClockCount] = idx;
+    append_block(idx);
   }
   size_ = nclk;
 }
 
-ClockElem &SyncClock::elem(unsigned tid) const {
+// Flushes all dirty elements into the main clock array.
+void SyncClock::FlushDirty() {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty *dirty = &dirty_[i];
+    if (dirty->tid != kInvalidTid) {
+      CHECK_LT(dirty->tid, size_);
+      elem(dirty->tid).epoch = dirty->epoch;
+      dirty->tid = kInvalidTid;
+    }
+  }
+}
+
+bool SyncClock::IsShared() const {
+  if (size_ == 0)
+    return false;
+  atomic_uint32_t *ref = ref_ptr(tab_);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  CHECK_GT(v, 0);
+  return v > 1;
+}
+
+// Unshares the current clock if it's shared.
+// Shared clocks are immutable, so they need to be unshared before any updates.
+// Note: this does not apply to dirty entries as they are not shared.
+void SyncClock::Unshare(ClockCache *c) {
+  if (!IsShared())
+    return;
+  // First, copy current state into old.
+  SyncClock old;
+  old.tab_ = tab_;
+  old.tab_idx_ = tab_idx_;
+  old.size_ = size_;
+  old.blocks_ = blocks_;
+  old.release_store_tid_ = release_store_tid_;
+  old.release_store_reused_ = release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    old.dirty_[i] = dirty_[i];
+  // Then, clear current object.
+  ResetImpl();
+  // Allocate brand new clock in the current object.
+  Resize(c, old.size_);
+  // Now copy state back into this object.
+  Iter old_iter(&old);
+  for (ClockElem &ce : *this) {
+    ce = *old_iter;
+    ++old_iter;
+  }
+  release_store_tid_ = old.release_store_tid_;
+  release_store_reused_ = old.release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    dirty_[i] = old.dirty_[i];
+  // Drop reference to old and delete if necessary.
+  old.Reset(c);
+}
+
+// Can we cache this clock for future release operations?
+ALWAYS_INLINE bool SyncClock::Cachable() const {
+  if (size_ == 0)
+    return false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    if (dirty_[i].tid != kInvalidTid)
+      return false;
+  }
+  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
+}
+
+// elem linearizes the two-level structure into a linear array.
+// Note: this is used only for one time accesses, vector operations use
+// the iterator as it is much faster.
+ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
   DCHECK_LT(tid, size_);
-  if (size_ <= ClockBlock::kClockCount)
+  const uptr block = tid / ClockBlock::kClockCount;
+  DCHECK_LE(block, blocks_);
+  tid %= ClockBlock::kClockCount;
+  if (block == blocks_)
     return tab_->clock[tid];
-  u32 idx = tab_->table[tid / ClockBlock::kClockCount];
+  u32 idx = get_block(block);
   ClockBlock *cb = ctx->clock_alloc.Map(idx);
-  return cb->clock[tid % ClockBlock::kClockCount];
+  return cb->clock[tid];
+}
+
+ALWAYS_INLINE uptr SyncClock::capacity() const {
+  if (size_ == 0)
+    return 0;
+  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
+  // How many clock elements we can fit into the first level block.
+  // +1 for ref counter.
+  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
+  return blocks_ * ClockBlock::kClockCount + top;
+}
+
+ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
+  DCHECK(size_);
+  DCHECK_LT(bi, blocks_);
+  return tab_->table[ClockBlock::kBlockIdx - bi];
+}
+
+ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
+  uptr bi = blocks_++;
+  CHECK_EQ(get_block(bi), 0);
+  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
+}
+
+// Used only by tests.
+u64 SyncClock::get(unsigned tid) const {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty dirty = dirty_[i];
+    if (dirty.tid == tid)
+      return dirty.epoch;
+  }
+  return elem(tid).epoch;
+}
+
+// Used only by Iter test.
+u64 SyncClock::get_clean(unsigned tid) const {
+  return elem(tid).epoch;
 }
 
 void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
@@ -419,8 +567,32 @@ void SyncClock::DebugDump(int(*printf)(c
   printf("] reused=[");
   for (uptr i = 0; i < size_; i++)
     printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
-  printf("] release_store_tid=%d/%d dirty_tids=%d/%d",
+  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
       release_store_tid_, release_store_reused_,
-      dirty_tids_[0], dirty_tids_[1]);
+      dirty_[0].tid, dirty_[0].epoch,
+      dirty_[1].tid, dirty_[1].epoch);
+}
+
+void SyncClock::Iter::Next() {
+  // Finished with the current block, move on to the next one.
+  block_++;
+  if (block_ < parent_->blocks_) {
+    // Iterate over the next second level block.
+    u32 idx = parent_->get_block(block_);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    pos_ = &cb->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  if (block_ == parent_->blocks_ &&
+      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
+    // Iterate over elements in the first level block.
+    pos_ = &parent_->tab_->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  parent_ = nullptr;  // denotes end
 }
 }  // namespace __tsan

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h?rev=308018&r1=308017&r2=308018&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h Fri Jul 14 04:30:06 2017
@@ -18,25 +18,6 @@
 
 namespace __tsan {
 
-struct ClockElem {
-  u64 epoch  : kClkBits;
-  u64 reused : 64 - kClkBits;
-};
-
-struct ClockBlock {
-  static const uptr kSize = 512;
-  static const uptr kTableSize = kSize / sizeof(u32);
-  static const uptr kClockCount = kSize / sizeof(ClockElem);
-
-  union {
-    u32       table[kTableSize];
-    ClockElem clock[kClockCount];
-  };
-
-  ClockBlock() {
-  }
-};
-
 typedef DenseSlabAlloc<ClockBlock, 1<<16, 1<<10> ClockAlloc;
 typedef DenseSlabAllocCache ClockCache;
 
@@ -46,69 +27,117 @@ class SyncClock {
   SyncClock();
   ~SyncClock();
 
-  uptr size() const {
-    return size_;
-  }
-
-  u64 get(unsigned tid) const {
-    return elem(tid).epoch;
-  }
+  uptr size() const;
+
+  // These are used only in tests.
+  u64 get(unsigned tid) const;
+  u64 get_clean(unsigned tid) const;
 
   void Resize(ClockCache *c, uptr nclk);
   void Reset(ClockCache *c);
 
   void DebugDump(int(*printf)(const char *s, ...));
 
+  // Clock element iterator.
+  // Note: it iterates only over the table without regard to dirty entries.
+  class Iter {
+   public:
+    explicit Iter(SyncClock* parent);
+    Iter& operator++();
+    bool operator!=(const Iter& other);
+    ClockElem &operator*();
+
+   private:
+    SyncClock *parent_;
+    // [pos_, end_) is the current continuous range of clock elements.
+    ClockElem *pos_;
+    ClockElem *end_;
+    int block_;  // Index of the current second level block.
+
+    NOINLINE void Next();
+  };
+
+  Iter begin();
+  Iter end();
+
  private:
-  friend struct ThreadClock;
+  friend class ThreadClock;
+  friend class Iter;
   static const uptr kDirtyTids = 2;
 
+  struct Dirty {
+    u64 epoch  : kClkBits;
+    u64 tid : 64 - kClkBits;  // kInvalidTid if not active
+  };
+
   unsigned release_store_tid_;
   unsigned release_store_reused_;
-  unsigned dirty_tids_[kDirtyTids];
-  // tab_ contains indirect pointer to a 512b block using DenseSlabAlloc.
-  // If size_ <= 64, then tab_ points to an array with 64 ClockElem's.
-  // Otherwise, tab_ points to an array with 128 u32 elements,
+  Dirty dirty_[kDirtyTids];
+  // If size_ is 0, tab_ is nullptr.
+  // If size_ <= 64 (kClockCount), tab_ points to an array with
+  // 64 ClockElem's (ClockBlock::clock).
+  // Otherwise, tab_ points to an array with up to 127 u32 elements,
   // each pointing to the second-level 512b block with 64 ClockElem's.
+  // Unused space in the first level ClockBlock is used to store additional
+  // clock elements.
+  // The last u32 element in the first level ClockBlock is always used as
+  // reference counter.
+  //
+  // See the following scheme for details.
+  // All memory blocks are 512 bytes (allocated from ClockAlloc).
+  // Clock (clk) elements are 64 bits.
+  // Idx and ref are 32 bits.
+  //
+  // tab_
+  //    |
+  //    \/
+  //    +----------------------------------------------------+
+  //    | clk128 | clk129 | ...unused... | idx1 | idx0 | ref |
+  //    +----------------------------------------------------+
+  //                                        |      |
+  //                                        |      \/
+  //                                        |      +----------------+
+  //                                        |      | clk0 ... clk63 |
+  //                                        |      +----------------+
+  //                                        \/
+  //                                        +------------------+
+  //                                        | clk64 ... clk127 |
+  //                                        +------------------+
+  //
+  // Note: dirty entries, if active, always override what's stored in the clock.
   ClockBlock *tab_;
   u32 tab_idx_;
-  u32 size_;
+  u16 size_;
+  u16 blocks_;  // Number of second level blocks.
 
+  void Unshare(ClockCache *c);
+  bool IsShared() const;
+  bool Cachable() const;
   void ResetImpl();
+  void FlushDirty();
+  uptr capacity() const;
+  u32 get_block(uptr bi) const;
+  void append_block(u32 idx);
   ClockElem &elem(unsigned tid) const;
 };
 
 // The clock that lives in threads.
-struct ThreadClock {
+class ThreadClock {
  public:
   typedef DenseSlabAllocCache Cache;
 
   explicit ThreadClock(unsigned tid, unsigned reused = 0);
 
-  u64 get(unsigned tid) const {
-    DCHECK_LT(tid, kMaxTidInClock);
-    return clk_[tid].epoch;
-  }
-
+  u64 get(unsigned tid) const;
   void set(ClockCache *c, unsigned tid, u64 v);
+  void set(u64 v);
+  void tick();
+  uptr size() const;
 
-  void set(u64 v) {
-    DCHECK_GE(v, clk_[tid_].epoch);
-    clk_[tid_].epoch = v;
-  }
-
-  void tick() {
-    clk_[tid_].epoch++;
-  }
-
-  uptr size() const {
-    return nclk_;
-  }
-
-  void acquire(ClockCache *c, const SyncClock *src);
-  void release(ClockCache *c, SyncClock *dst) const;
+  void acquire(ClockCache *c, SyncClock *src);
+  void release(ClockCache *c, SyncClock *dst);
   void acq_rel(ClockCache *c, SyncClock *dst);
-  void ReleaseStore(ClockCache *c, SyncClock *dst) const;
+  void ReleaseStore(ClockCache *c, SyncClock *dst);
   void ResetCached(ClockCache *c);
 
   void DebugReset();
@@ -116,16 +145,82 @@ struct ThreadClock {
 
  private:
   static const uptr kDirtyTids = SyncClock::kDirtyTids;
+  // Index of the thread associated with the clock ("current thread").
   const unsigned tid_;
-  const unsigned reused_;
+  const unsigned reused_;  // tid_ reuse count.
+  // Current thread time when it acquired something from other threads.
   u64 last_acquire_;
+
+  // Cached SyncClock (without dirty entries and release_store_tid_).
+  // We reuse it for subsequent store-release operations without intervening
+  // acquire operations. Since it is shared (and thus constant), clock value
+  // for the current thread is then stored in dirty entries in the SyncClock.
+  // We hold a reference to the table while it is cached here.
+  u32 cached_idx_;
+  u16 cached_size_;
+  u16 cached_blocks_;
+
+  // Number of active elements in the clk_ table (the rest is zeros).
   uptr nclk_;
-  ClockElem clk_[kMaxTidInClock];
+  u64 clk_[kMaxTidInClock];  // Fixed size vector clock.
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
-  void UpdateCurrentThread(SyncClock *dst) const;
+  void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
 };
 
+ALWAYS_INLINE u64 ThreadClock::get(unsigned tid) const {
+  DCHECK_LT(tid, kMaxTidInClock);
+  return clk_[tid];
+}
+
+ALWAYS_INLINE void ThreadClock::set(u64 v) {
+  DCHECK_GE(v, clk_[tid_]);
+  clk_[tid_] = v;
+}
+
+ALWAYS_INLINE void ThreadClock::tick() {
+  clk_[tid_]++;
+}
+
+ALWAYS_INLINE uptr ThreadClock::size() const {
+  return nclk_;
+}
+
+ALWAYS_INLINE SyncClock::Iter SyncClock::begin() {
+  return Iter(this);
+}
+
+ALWAYS_INLINE SyncClock::Iter SyncClock::end() {
+  return Iter(nullptr);
+}
+
+ALWAYS_INLINE uptr SyncClock::size() const {
+  return size_;
+}
+
+ALWAYS_INLINE SyncClock::Iter::Iter(SyncClock* parent)
+    : parent_(parent)
+    , pos_(nullptr)
+    , end_(nullptr)
+    , block_(-1) {
+  if (parent)
+    Next();
+}
+
+ALWAYS_INLINE SyncClock::Iter& SyncClock::Iter::operator++() {
+  pos_++;
+  if (UNLIKELY(pos_ >= end_))
+    Next();
+  return *this;
+}
+
+ALWAYS_INLINE bool SyncClock::Iter::operator!=(const SyncClock::Iter& other) {
+  return parent_ != other.parent_;
+}
+
+ALWAYS_INLINE ClockElem &SyncClock::Iter::operator*() {
+  return *pos_;
+}
 }  // namespace __tsan
 
 #endif  // TSAN_CLOCK_H

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_defs.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_defs.h?rev=308018&r1=308017&r2=308018&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_defs.h (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_defs.h Fri Jul 14 04:30:06 2017
@@ -38,15 +38,40 @@
 
 namespace __tsan {
 
+const int kClkBits = 42;
+const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
+
+struct ClockElem {
+  u64 epoch  : kClkBits;
+  u64 reused : 64 - kClkBits;  // tid reuse count
+};
+
+struct ClockBlock {
+  static const uptr kSize = 512;
+  static const uptr kTableSize = kSize / sizeof(u32);
+  static const uptr kClockCount = kSize / sizeof(ClockElem);
+  static const uptr kRefIdx = kTableSize - 1;
+  static const uptr kBlockIdx = kTableSize - 2;
+
+  union {
+    u32       table[kTableSize];
+    ClockElem clock[kClockCount];
+  };
+
+  ClockBlock() {
+  }
+};
+
 const int kTidBits = 13;
-const unsigned kMaxTid = 1 << kTidBits;
+// Reduce kMaxTid by kClockCount because one slot in the ClockBlock table is
+// occupied by the reference counter, so the total number of elements we can
+// store in a SyncClock is kClockCount * (kTableSize - 1).
+const unsigned kMaxTid = (1 << kTidBits) - ClockBlock::kClockCount;
 #if !SANITIZER_GO
 const unsigned kMaxTidInClock = kMaxTid * 2;  // This includes msb 'freed' bit.
 #else
 const unsigned kMaxTidInClock = kMaxTid;  // Go does not track freed memory.
 #endif
-const int kClkBits = 42;
-const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
 const uptr kShadowStackSize = 64 * 1024;
 
 // Count of shadow values in a shadow cell.
@@ -74,7 +99,7 @@ const bool kCollectHistory = false;
 const bool kCollectHistory = true;
 #endif
 
-const unsigned kInvalidTid = (unsigned)-1;
+const u16 kInvalidTid = kMaxTid + 1;
 
 // The following "build consistency" machinery ensures that all source files
 // are built in the same configuration. Inconsistent builds lead to

Modified: compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc?rev=308018&r1=308017&r2=308018&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc (original)
+++ compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc Fri Jul 14 04:30:06 2017
@@ -53,6 +53,31 @@ TEST(Clock, ChunkedBasic) {
   chunked.Reset(&cache);
 }
 
+static const uptr interesting_sizes[] = {0, 1, 2, 30, 61, 62, 63, 64, 65, 66,
+    100, 124, 125, 126, 127, 128, 129, 130, 188, 189, 190, 191, 192, 193, 254,
+    255};
+
+TEST(Clock, Iter) {
+  const uptr n = ARRAY_SIZE(interesting_sizes);
+  for (uptr fi = 0; fi < n; fi++) {
+    const uptr size = interesting_sizes[fi];
+    SyncClock sync;
+    ThreadClock vector(0);
+    for (uptr i = 0; i < size; i++)
+      vector.set(&cache, i, i + 1);
+    if (size != 0)
+      vector.release(&cache, &sync);
+    uptr i = 0;
+    for (ClockElem &ce : sync) {
+      ASSERT_LT(i, size);
+      ASSERT_EQ(sync.get_clean(i), ce.epoch);
+      i++;
+    }
+    ASSERT_EQ(i, size);
+    sync.Reset(&cache);
+  }
+}
+
 TEST(Clock, AcquireRelease) {
   ThreadClock vector1(100);
   vector1.tick();
@@ -216,13 +241,11 @@ TEST(Clock, Growth) {
 
 TEST(Clock, Growth2) {
   // Test clock growth for every pair of sizes:
-  const uptr sizes[] = {0, 1, 2, 30, 61, 62, 63, 64, 65, 66, 100, 124, 125, 126,
-      127, 128, 129, 130, 188, 189, 190, 191, 192, 193, 254, 255};
-  const uptr n = sizeof(sizes) / sizeof(sizes[0]);
+  const uptr n = ARRAY_SIZE(interesting_sizes);
   for (uptr fi = 0; fi < n; fi++) {
     for (uptr ti = fi + 1; ti < n; ti++) {
-      const uptr from = sizes[fi];
-      const uptr to = sizes[ti];
+      const uptr from = interesting_sizes[fi];
+      const uptr to = interesting_sizes[ti];
       SyncClock sync;
       ThreadClock vector(0);
       for (uptr i = 0; i < from; i++)



