[compiler-rt] r204656 - tsan: optimize vector clock operations

Mon Mar 24 11:54:21 PDT 2014

Author: dvyukov
Date: Mon Mar 24 13:54:20 2014
New Revision: 204656

URL: http://llvm.org/viewvc/llvm-project?rev=204656&view=rev
Log:
tsan: optimize vector clock operations
Make vector clock operations O(1) for several important classes of use cases.
See comments for details.
Below are stats from a large server app, 77% of all clock operations are handled as O(1).

Clock acquire                     :         25983645
  empty clock                     :          6288080
  fast from release-store         :         14917504
  contains my tid                 :          4515743
  repeated (fast)                 :          2141428
  full (slow)                     :          2636633
  acquired something              :          1426863
Clock release                     :          2544216
  resize                          :             6241
  fast1                           :           197693
  fast2                           :          1016293
  fast3                           :             2007
  full (slow)                     :          1797488
  was acquired                    :           709227
  clear tail                      :                1
  last overflow                   :                0
Clock release store               :          3446946
  resize                          :           200516
  fast                            :           469265
  slow                            :          2977681
  clear tail                      :                0
Clock acquire-release             :           820028



Modified:
    compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h
    compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_stat.cc
    compiler-rt/trunk/lib/tsan/rtl/tsan_stat.h
    compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc?rev=204656&r1=204655&r2=204656&view=diff
==============================================================================

--- compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_clock.cc Mon Mar 24 13:54:20 2014
@@ -13,66 +13,157 @@
 #include "tsan_clock.h"
 #include "tsan_rtl.h"
 
-// It's possible to optimize clock operations for some important cases
-// so that they are O(1). The cases include singletons, once's, local mutexes.
-// First, SyncClock must be re-implemented to allow indexing by tid.
-// It must not necessarily be a full vector clock, though. For example it may
-// be a multi-level table.
-// Then, each slot in SyncClock must contain a dirty bit (it's united with
-// the clock value, so no space increase). The acquire algorithm looks
-// as follows:
-// void acquire(thr, tid, thr_clock, sync_clock) {
-//   if (!sync_clock[tid].dirty)
-//     return;  // No new info to acquire.
-//              // This handles constant reads of singleton pointers and
-//              // stop-flags.
-//   acquire_impl(thr_clock, sync_clock);  // As usual, O(N).
-//   sync_clock[tid].dirty = false;
-//   sync_clock.dirty_count--;
-// }
-// The release operation looks as follows:
-// void release(thr, tid, thr_clock, sync_clock) {
-//   // thr->sync_cache is a simple fixed-size hash-based cache that holds
-//   // several previous sync_clock's.
-//   if (thr->sync_cache[sync_clock] >= thr->last_acquire_epoch) {
-//     // The thread did no acquire operations since last release on this clock.
-//     // So update only the thread's slot (other slots can't possibly change).
-//     sync_clock[tid].clock = thr->epoch;
-//     if (sync_clock.dirty_count == sync_clock.cnt
-//         || (sync_clock.dirty_count == sync_clock.cnt - 1
-//           && sync_clock[tid].dirty == false))
-//       // All dirty flags are set, bail out.
-//       return;
-//     set all dirty bits, but preserve the thread's bit.  // O(N)
-//     update sync_clock.dirty_count;
-//     return;
+// SyncClock and ThreadClock implement vector clocks for sync variables
+// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
+// ThreadClock contains fixed-size vector clock for maximum number of threads.
+// SyncClock contains growable vector clock for currently necessary number of
+// threads.
+// Together they implement very simple model of operations, namely:
+//
+//   void ThreadClock::acquire(const SyncClock *src) {
+//     for (int i = 0; i < kMaxThreads; i++)
+//       clock[i] = max(clock[i], src->clock[i]);
 //   }
-//   release_impl(thr_clock, sync_clock);  // As usual, O(N).
-//   set all dirty bits, but preserve the thread's bit.
-//   // The previous step is combined with release_impl(), so that
-//   // we scan the arrays only once.
-//   update sync_clock.dirty_count;
-// }
+//
+//   void ThreadClock::release(SyncClock *dst) const {
+//     for (int i = 0; i < kMaxThreads; i++)
+//       dst->clock[i] = max(dst->clock[i], clock[i]);
+//   }
+//
+//   void ThreadClock::ReleaseStore(SyncClock *dst) const {
+//     for (int i = 0; i < kMaxThreads; i++)
+//       dst->clock[i] = clock[i];
+//   }
+//
+//   void ThreadClock::acq_rel(SyncClock *dst) {
+//     acquire(dst);
+//     release(dst);
+//   }
+//
+// Conformance to this model is extensively verified in tsan_clock_test.cc.
+// However, the implementation is significantly more complex. The complexity
+// lets important classes of use cases run in O(1) instead of O(N).
+//
+// The use cases are:
+// 1. Singleton/once atomic that has a single release-store operation followed
+//    by zillions of acquire-loads (the acquire-load is O(1)).
+// 2. Thread-local mutex (both lock and unlock can be O(1)).
+// 3. Leaf mutex (unlock is O(1)).
+// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
+// 5. An atomic with a single writer (writes can be O(1)).
+// The implementation dynamically adapts to workload. So if an atomic is in
+// read-only phase, these reads will be O(1); if it later switches to read/write
+// phase, the implementation will correctly handle that by switching to O(N).
+//
+// Thread-safety note: all const operations on SyncClock's are conducted under
+// a shared lock; all non-const operations on SyncClock's are conducted under
+// an exclusive lock; ThreadClock's are private to respective threads and so
+// do not need any protection.
+//
+// Description of ThreadClock state:
+// clk_ - fixed size vector clock.
+// nclk_ - effective size of the vector clock (the rest is zeros).
+// tid_ - index of the thread associated with the clock ("current thread").
+// last_acquire_ - current thread time when it acquired something from
+//   other threads.
+//
+// Description of SyncClock state:
+// clk_ - variable size vector clock, low kClkBits hold timestamp,
+//   the remaining bits hold "last_acq" counter;
+//   if last_acq == release_seq_, then the respective thread has already
+//   acquired this clock (except possibly dirty_tids_).
+// dirty_tids_ - holds up to two indices in the vector clock that other threads
+//   need to acquire regardless of last_acq value;
+// release_store_tid_ - denotes that the clock state is a result of
+//   release-store operation by the thread with release_store_tid_ index.
+
+// We don't have ThreadState in these methods, so this is an ugly hack that
+// works only in C++.
+#ifndef TSAN_GO
+# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
+#else
+# define CPP_STAT_INC(typ) (void)0
+#endif
 
 namespace __tsan {
 
-ThreadClock::ThreadClock() {
-  nclk_ = 0;
-  for (uptr i = 0; i < (uptr)kMaxTidInClock; i++)
-    clk_[i] = 0;
+const unsigned kInvalidTid = (unsigned)-1;
+
+ThreadClock::ThreadClock(unsigned tid)
+    : tid_(tid), last_acquire_(0), nclk_(tid + 1) {
+  // last_acquire_ is read by release()/ReleaseStore(), so it must start at 0.
+  DCHECK_LT(tid, kMaxTidInClock);
+  internal_memset(clk_, 0, sizeof(clk_));
 }
 
 void ThreadClock::acquire(const SyncClock *src) {
   DCHECK(nclk_ <= kMaxTid);
   DCHECK(src->clk_.Size() <= kMaxTid);
+  CPP_STAT_INC(StatClockAcquire);
 
+  // Check if it's empty -> no need to do anything.
   const uptr nclk = src->clk_.Size();
-  if (nclk == 0)
+  if (nclk == 0) {
+    CPP_STAT_INC(StatClockAcquireEmpty);
+    return;
+  }
+
+  // If the clock is a result of release-store operation, and the current thread
+  // has already acquired from that thread after or at that time,
+  // don't need to do anything (src can't contain anything new for the
+  // current thread).
+  unsigned tid1 = src->release_store_tid_;
+  if (tid1 != kInvalidTid && (src->clk_[tid1] & kClkMask) <= clk_[tid1]) {
+    CPP_STAT_INC(StatClockAcquireFastRelease);
     return;
+  }
+
+  // Check if we've already acquired src after the last release operation on src
+  bool acquired = false;
+  if (nclk > tid_) {
+    CPP_STAT_INC(StatClockAcquireLarge);
+    u64 myepoch = src->clk_[tid_];
+    u64 last_acq = myepoch >> kClkBits;
+    if (last_acq == src->release_seq_) {
+      CPP_STAT_INC(StatClockAcquireRepeat);
+      for (unsigned i = 0; i < kDirtyTids; i++) {
+        unsigned tid = src->dirty_tids_[i];
+        if (tid != kInvalidTid) {
+          u64 epoch = src->clk_[tid] & kClkMask;
+          if (clk_[tid] < epoch) {
+            clk_[tid] = epoch;
+            acquired = true;
+          }
+        }
+      }
+      if (acquired) {
+        CPP_STAT_INC(StatClockAcquiredSomething);
+        last_acquire_ = clk_[tid_];
+      }
+      return;
+    }
+  }
+
+  // O(N) acquire.
+  CPP_STAT_INC(StatClockAcquireFull);
   nclk_ = max(nclk_, nclk);
   for (uptr i = 0; i < nclk; i++) {
-    if (clk_[i] < src->clk_[i])
-      clk_[i] = src->clk_[i];
+    u64 epoch = src->clk_[i] & kClkMask;
+    if (clk_[i] < epoch) {
+      clk_[i] = epoch;
+      acquired = true;
+    }
+  }
+
+  // Remember that this thread has acquired this clock.
+  if (nclk > tid_) {
+    u64 myepoch = src->clk_[tid_];
+    src->clk_[tid_] = (myepoch & kClkMask) | (src->release_seq_ << kClkBits);
+  }
+
+  if (acquired) {
+    CPP_STAT_INC(StatClockAcquiredSomething);
+    last_acquire_ = clk_[tid_];
   }
 }
 
@@ -80,32 +171,185 @@ void ThreadClock::release(SyncClock *dst
   DCHECK(nclk_ <= kMaxTid);
   DCHECK(dst->clk_.Size() <= kMaxTid);
 
-  if (dst->clk_.Size() < nclk_)
+  if (dst->clk_.Size() == 0) {
+    // ReleaseStore will correctly set release_store_tid_,
+    // which can be important for future operations.
+    ReleaseStore(dst);
+    return;
+  }
+
+  CPP_STAT_INC(StatClockRelease);
+  // Check if we need to resize dst.
+  if (dst->clk_.Size() < nclk_) {
+    CPP_STAT_INC(StatClockReleaseResize);
     dst->clk_.Resize(nclk_);
-  for (uptr i = 0; i < nclk_; i++) {
-    if (dst->clk_[i] < clk_[i])
-      dst->clk_[i] = clk_[i];
   }
+
+  // Check if we had not acquired anything from other threads
+  // since the last release on dst. If so, we need to update
+  // only dst->clk_[tid_].
+  if ((dst->clk_[tid_] & kClkMask) > last_acquire_) {
+    UpdateCurrentThread(dst);
+    if (dst->release_store_tid_ != tid_)
+      dst->release_store_tid_ = kInvalidTid;
+    return;
+  }
+
+  // O(N) release.
+  CPP_STAT_INC(StatClockReleaseFull);
+  // First, remember whether we've acquired dst.
+  bool acquired = IsAlreadyAcquired(dst);
+  if (acquired)
+    CPP_STAT_INC(StatClockReleaseAcquired);
+  // Update dst->clk_.
+  for (uptr i = 0; i < nclk_; i++)
+    dst->clk_[i] = max(dst->clk_[i] & kClkMask, clk_[i]);
+  // Clear last_acq in the remaining elements.
+  if (nclk_ < dst->clk_.Size())
+    CPP_STAT_INC(StatClockReleaseClearTail);
+  for (uptr i = nclk_; i < dst->clk_.Size(); i++)
+    dst->clk_[i] = dst->clk_[i] & kClkMask;
+  // Since we've cleared all last_acq, we can reset release_seq_ as well.
+  dst->release_seq_ = 1;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    dst->dirty_tids_[i] = kInvalidTid;
+  dst->release_store_tid_ = kInvalidTid;
+  // If we've acquired dst, remember this fact,
+  // so that we don't need to acquire it on next acquire.
+  if (acquired)
+    dst->clk_[tid_] = dst->clk_[tid_] | (1ULL << kClkBits);
 }
 
 void ThreadClock::ReleaseStore(SyncClock *dst) const {
   DCHECK(nclk_ <= kMaxTid);
   DCHECK(dst->clk_.Size() <= kMaxTid);
+  CPP_STAT_INC(StatClockStore);
 
-  if (dst->clk_.Size() < nclk_)
+  // Check if we need to resize dst.
+  if (dst->clk_.Size() < nclk_) {
+    CPP_STAT_INC(StatClockStoreResize);
     dst->clk_.Resize(nclk_);
+  }
+
+  if (dst->release_store_tid_ == tid_ &&
+      (dst->clk_[tid_] & kClkMask) > last_acquire_) {
+    CPP_STAT_INC(StatClockStoreFast);
+    UpdateCurrentThread(dst);
+    return;
+  }
+
+  // O(N) release-store.
+  CPP_STAT_INC(StatClockStoreFull);
   for (uptr i = 0; i < nclk_; i++)
     dst->clk_[i] = clk_[i];
-  for (uptr i = nclk_; i < dst->clk_.Size(); i++)
-    dst->clk_[i] = 0;
+  // Clear the tail of dst->clk_.
+  if (nclk_ < dst->clk_.Size()) {
+    internal_memset(&dst->clk_[nclk_], 0,
+        (dst->clk_.Size() - nclk_) * sizeof(dst->clk_[0]));
+    CPP_STAT_INC(StatClockStoreTail);
+  }
+  // Since we've cleared all last_acq, we can reset release_seq_ as well.
+  dst->release_seq_ = 1;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    dst->dirty_tids_[i] = kInvalidTid;
+  dst->release_store_tid_ = tid_;
+  // Remember that we don't need to acquire it in future.
+  dst->clk_[tid_] = clk_[tid_] | (1ULL << kClkBits);
 }
 
 void ThreadClock::acq_rel(SyncClock *dst) {
+  CPP_STAT_INC(StatClockAcquireRelease);
   acquire(dst);
-  release(dst);
+  ReleaseStore(dst);  // clk_ >= dst after acquire(), so store == release().
+}
+
+// Updates only the element of dst->clk_ that belongs to the current thread.
+void ThreadClock::UpdateCurrentThread(SyncClock *dst) const {
+  // Update the thread's time, but preserve last_acq (bits above kClkMask).
+  dst->clk_[tid_] = clk_[tid_] | (dst->clk_[tid_] & ~kClkMask);
+
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    if (dst->dirty_tids_[i] == tid_) {  // Already recorded as dirty.
+      CPP_STAT_INC(StatClockReleaseFast1);
+      return;
+    }
+    if (dst->dirty_tids_[i] == kInvalidTid) {  // Free slot: record tid_.
+      CPP_STAT_INC(StatClockReleaseFast2);
+      dst->dirty_tids_[i] = tid_;
+      return;
+    }
+  }
+  CPP_STAT_INC(StatClockReleaseFast3);
+  dst->release_seq_++;  // No free slot: invalidate every stored last_acq.
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    dst->dirty_tids_[i] = kInvalidTid;
+  if ((dst->release_seq_ << kClkBits) == 0) {  // Counter overflowed its bits.
+    CPP_STAT_INC(StatClockReleaseLastOverflow);
+    dst->release_seq_ = 1;
+    for (uptr i = 0; i < dst->clk_.Size(); i++)
+      dst->clk_[i] = dst->clk_[i] & kClkMask;
+  }
+}
+
+// Checks whether the current thread has already acquired src.
+bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
+  u64 myepoch = src->clk_[tid_];
+  u64 last_acq = myepoch >> kClkBits;  // High bits hold the last_acq counter.
+  if (last_acq != src->release_seq_)  // Stale or never recorded.
+    return false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {  // Also check dirty tids.
+    unsigned tid = src->dirty_tids_[i];
+    if (tid != kInvalidTid) {
+      u64 epoch = src->clk_[tid] & kClkMask;
+      if (clk_[tid] < epoch)  // src holds something newer for tid.
+        return false;
+    }
+  }
+  return true;
+}
+
+// Sets a single element in the vector clock.
+// This function is called only from weird places like AcquireGlobal.
+void ThreadClock::set(unsigned tid, u64 v) {
+  DCHECK_LT(tid, kMaxTid);
+  DCHECK_GE(v, clk_[tid]);
+  clk_[tid] = v;
+  if (nclk_ <= tid)
+    nclk_ = tid + 1;
+  last_acquire_ = clk_[tid_];
+}
+
+void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
+  printf("clock=[");
+  for (uptr i = 0; i < nclk_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
+  printf("] tid=%u last_acq=%llu", tid_, last_acquire_);
 }
 
 SyncClock::SyncClock()
-  : clk_(MBlockClock) {
+    : clk_(MBlockClock) {
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dirty_tids_[i] = kInvalidTid;
+  release_seq_ = 0;
+  release_store_tid_ = kInvalidTid;
+}
+
+void SyncClock::Reset() {
+  clk_.Reset();
+  release_seq_ = 0;
+  release_store_tid_ = kInvalidTid;
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dirty_tids_[i] = kInvalidTid;
+}
+
+void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
+  printf("clock=[");
+  for (uptr i = 0; i < clk_.Size(); i++)
+    printf("%s%llu", i == 0 ? "" : ",", clk_[i] & kClkMask);
+  printf("] last_acq=[");
+  for (uptr i = 0; i < clk_.Size(); i++)
+    printf("%s%llu", i == 0 ? "" : ",", clk_[i] >> kClkBits);
+  printf("] release_seq=%llu release_store_tid=%d dirty_tids=%d/%d",
+      release_seq_, release_store_tid_, dirty_tids_[0], dirty_tids_[1]);
 }
 }  // namespace __tsan

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h?rev=204656&r1=204655&r2=204656&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_clock.h Mon Mar 24 13:54:20 2014
@@ -18,6 +18,8 @@
 
 namespace __tsan {
 
+const u64 kClkMask = (1ULL << kClkBits) - 1;
+
 // The clock that lives in sync variables (mutexes, atomics, etc).
 class SyncClock {
  public:
@@ -27,38 +29,44 @@ class SyncClock {
     return clk_.Size();
   }
 
-  void Reset() {
-    clk_.Reset();
+  u64 get(unsigned tid) const {
+    DCHECK_LT(tid, clk_.Size());
+    return clk_[tid] & kClkMask;
   }
 
+  void Reset();
+
+  void DebugDump(int(*printf)(const char *s, ...));
+
  private:
-  Vector<u64> clk_;
+  u64 release_seq_;
+  unsigned release_store_tid_;
+  static const uptr kDirtyTids = 2;
+  unsigned dirty_tids_[kDirtyTids];
+  mutable Vector<u64> clk_;
   friend struct ThreadClock;
 };
 
 // The clock that lives in threads.
 struct ThreadClock {
  public:
-  ThreadClock();
+  explicit ThreadClock(unsigned tid);
 
   u64 get(unsigned tid) const {
     DCHECK_LT(tid, kMaxTidInClock);
+    DCHECK_EQ(clk_[tid], clk_[tid] & kClkMask);
     return clk_[tid];
   }
 
-  void set(unsigned tid, u64 v) {
-    DCHECK_LT(tid, kMaxTid);
-    DCHECK_GE(v, clk_[tid]);
-    clk_[tid] = v;
-    if (nclk_ <= tid)
-      nclk_ = tid + 1;
+  void set(unsigned tid, u64 v);
+
+  void set(u64 v) {
+    DCHECK_GE(v, clk_[tid_]);
+    clk_[tid_] = v;
   }
 
-  void tick(unsigned tid) {
-    DCHECK_LT(tid, kMaxTid);
-    clk_[tid]++;
-    if (nclk_ <= tid)
-      nclk_ = tid + 1;
+  void tick() {
+    clk_[tid_]++;
   }
 
   uptr size() const {
@@ -70,9 +78,17 @@ struct ThreadClock {
   void acq_rel(SyncClock *dst);
   void ReleaseStore(SyncClock *dst) const;
 
+  void DebugDump(int(*printf)(const char *s, ...));
+
  private:
+  static const uptr kDirtyTids = SyncClock::kDirtyTids;
+  const unsigned tid_;
+  u64 last_acquire_;
   uptr nclk_;
   u64 clk_[kMaxTidInClock];
+
+  bool IsAlreadyAcquired(const SyncClock *src) const;
+  void UpdateCurrentThread(SyncClock *dst) const;
 };
 
 }  // namespace __tsan

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.cc?rev=204656&r1=204655&r2=204656&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_rtl.cc Mon Mar 24 13:54:20 2014
@@ -90,6 +90,7 @@ ThreadState::ThreadState(Context *ctx, i
   // they may be accessed before the ctor.
   // , ignore_reads_and_writes()
   // , ignore_interceptors()
+  , clock(tid)
 #ifndef TSAN_GO
   , jmp_bufs(MBlockJmpBuf)
 #endif
@@ -98,7 +99,11 @@ ThreadState::ThreadState(Context *ctx, i
   , stk_addr(stk_addr)
   , stk_size(stk_size)
   , tls_addr(tls_addr)
-  , tls_size(tls_size) {
+  , tls_size(tls_size)
+#ifndef TSAN_GO
+  , last_sleep_clock(tid)
+#endif
+{
 }
 
 static void MemoryProfiler(Context *ctx, fd_t fd, int i) {

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc?rev=204656&r1=204655&r2=204656&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_rtl_mutex.cc Mon Mar 24 13:54:20 2014
@@ -376,7 +376,7 @@ void AfterSleep(ThreadState *thr, uptr p
 void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
-  thr->clock.set(thr->tid, thr->fast_state.epoch());
+  thr->clock.set(thr->fast_state.epoch());
   thr->clock.acquire(c);
   StatInc(thr, StatSyncAcquire);
 }
@@ -384,7 +384,7 @@ void AcquireImpl(ThreadState *thr, uptr
 void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
-  thr->clock.set(thr->tid, thr->fast_state.epoch());
+  thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
   thr->clock.release(c);
   StatInc(thr, StatSyncRelease);
@@ -393,7 +393,7 @@ void ReleaseImpl(ThreadState *thr, uptr
 void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
-  thr->clock.set(thr->tid, thr->fast_state.epoch());
+  thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
   thr->clock.ReleaseStore(c);
   StatInc(thr, StatSyncRelease);
@@ -402,7 +402,7 @@ void ReleaseStoreImpl(ThreadState *thr,
 void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
-  thr->clock.set(thr->tid, thr->fast_state.epoch());
+  thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
   thr->clock.acq_rel(c);
   StatInc(thr, StatSyncAcquire);

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_stat.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_stat.cc?rev=204656&r1=204655&r2=204656&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_stat.cc (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_stat.cc Mon Mar 24 13:54:20 2014
@@ -74,6 +74,29 @@ void StatOutput(u64 *stat) {
   name[StatSyncAcquire]                  = "             acquired             ";
   name[StatSyncRelease]                  = "             released             ";
 
+  name[StatClockAcquire]                 = "Clock acquire                     ";
+  name[StatClockAcquireEmpty]            = "  empty clock                     ";
+  name[StatClockAcquireFastRelease]      = "  fast from release-store         ";
+  name[StatClockAcquireLarge]            = "  contains my tid                 ";
+  name[StatClockAcquireRepeat]           = "  repeated (fast)                 ";
+  name[StatClockAcquireFull]             = "  full (slow)                     ";
+  name[StatClockAcquiredSomething]       = "  acquired something              ";
+  name[StatClockRelease]                 = "Clock release                     ";
+  name[StatClockReleaseResize]           = "  resize                          ";
+  name[StatClockReleaseFast1]            = "  fast1                           ";
+  name[StatClockReleaseFast2]            = "  fast2                           ";
+  name[StatClockReleaseFast3]            = "  fast3                           ";
+  name[StatClockReleaseFull]             = "  full (slow)                     ";
+  name[StatClockReleaseAcquired]         = "  was acquired                    ";
+  name[StatClockReleaseClearTail]        = "  clear tail                      ";
+  name[StatClockReleaseLastOverflow]     = "  last overflow                   ";
+  name[StatClockStore]                   = "Clock release store               ";
+  name[StatClockStoreResize]             = "  resize                          ";
+  name[StatClockStoreFast]               = "  fast                            ";
+  name[StatClockStoreFull]               = "  slow                            ";
+  name[StatClockStoreTail]               = "  clear tail                      ";
+  name[StatClockAcquireRelease]          = "Clock acquire-release             ";
+
   name[StatAtomic]                       = "Atomic operations                 ";
   name[StatAtomicLoad]                   = "  Including load                  ";
   name[StatAtomicStore]                  = "            store                 ";
@@ -150,7 +173,7 @@ void StatOutput(u64 *stat) {
 
   Printf("Statistics:\n");
   for (int i = 0; i < StatCnt; i++)
-    Printf("%s: %zu\n", name[i], (uptr)stat[i]);
+    Printf("%s: %16zu\n", name[i], (uptr)stat[i]);
 }
 
 }  // namespace __tsan

Modified: compiler-rt/trunk/lib/tsan/rtl/tsan_stat.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/rtl/tsan_stat.h?rev=204656&r1=204655&r2=204656&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/rtl/tsan_stat.h (original)
+++ compiler-rt/trunk/lib/tsan/rtl/tsan_stat.h Mon Mar 24 13:54:20 2014
@@ -69,6 +69,33 @@ enum StatType {
   StatSyncAcquire,
   StatSyncRelease,
 
+  // Clocks - acquire.
+  StatClockAcquire,
+  StatClockAcquireEmpty,
+  StatClockAcquireFastRelease,
+  StatClockAcquireLarge,
+  StatClockAcquireRepeat,
+  StatClockAcquireFull,
+  StatClockAcquiredSomething,
+  // Clocks - release.
+  StatClockRelease,
+  StatClockReleaseResize,
+  StatClockReleaseFast1,
+  StatClockReleaseFast2,
+  StatClockReleaseFast3,
+  StatClockReleaseFull,
+  StatClockReleaseAcquired,
+  StatClockReleaseClearTail,
+  StatClockReleaseLastOverflow,
+  // Clocks - release store.
+  StatClockStore,
+  StatClockStoreResize,
+  StatClockStoreFast,
+  StatClockStoreFull,
+  StatClockStoreTail,
+  // Clocks - acquire-release.
+  StatClockAcquireRelease,
+
   // Atomics.
   StatAtomic,
   StatAtomicLoad,

Modified: compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc?rev=204656&r1=204655&r2=204656&view=diff
==============================================================================
--- compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc (original)
+++ compiler-rt/trunk/lib/tsan/tests/unit/tsan_clock_test.cc Mon Mar 24 13:54:20 2014
@@ -17,101 +17,294 @@
 namespace __tsan {
 
 TEST(Clock, VectorBasic) {
-  ThreadClock clk;
-  CHECK_EQ(clk.size(), 0);
-  clk.tick(0);
-  CHECK_EQ(clk.size(), 1);
-  CHECK_EQ(clk.get(0), 1);
-  clk.tick(3);
-  CHECK_EQ(clk.size(), 4);
-  CHECK_EQ(clk.get(0), 1);
-  CHECK_EQ(clk.get(1), 0);
-  CHECK_EQ(clk.get(2), 0);
-  CHECK_EQ(clk.get(3), 1);
-  clk.tick(3);
-  CHECK_EQ(clk.get(3), 2);
+  ThreadClock clk(0);
+  ASSERT_EQ(clk.size(), 1);
+  clk.tick();
+  ASSERT_EQ(clk.size(), 1);
+  ASSERT_EQ(clk.get(0), 1);
+  clk.set(3, clk.get(3) + 1);
+  ASSERT_EQ(clk.size(), 4);
+  ASSERT_EQ(clk.get(0), 1);
+  ASSERT_EQ(clk.get(1), 0);
+  ASSERT_EQ(clk.get(2), 0);
+  ASSERT_EQ(clk.get(3), 1);
+  clk.set(3, clk.get(3) + 1);
+  ASSERT_EQ(clk.get(3), 2);
 }
 
 TEST(Clock, ChunkedBasic) {
-  ThreadClock vector;
+  ThreadClock vector(0);
   SyncClock chunked;
-  CHECK_EQ(vector.size(), 0);
-  CHECK_EQ(chunked.size(), 0);
+  ASSERT_EQ(vector.size(), 1);
+  ASSERT_EQ(chunked.size(), 0);
   vector.acquire(&chunked);
-  CHECK_EQ(vector.size(), 0);
-  CHECK_EQ(chunked.size(), 0);
+  ASSERT_EQ(vector.size(), 1);
+  ASSERT_EQ(chunked.size(), 0);
   vector.release(&chunked);
-  CHECK_EQ(vector.size(), 0);
-  CHECK_EQ(chunked.size(), 0);
+  ASSERT_EQ(vector.size(), 1);
+  ASSERT_EQ(chunked.size(), 1);
   vector.acq_rel(&chunked);
-  CHECK_EQ(vector.size(), 0);
-  CHECK_EQ(chunked.size(), 0);
+  ASSERT_EQ(vector.size(), 1);
+  ASSERT_EQ(chunked.size(), 1);
 }
 
 TEST(Clock, AcquireRelease) {
-  ThreadClock vector1;
-  vector1.tick(100);
+  ThreadClock vector1(100);
+  vector1.tick();
   SyncClock chunked;
   vector1.release(&chunked);
-  CHECK_EQ(chunked.size(), 101);
-  ThreadClock vector2;
+  ASSERT_EQ(chunked.size(), 101);
+  ThreadClock vector2(0);
   vector2.acquire(&chunked);
-  CHECK_EQ(vector2.size(), 101);
-  CHECK_EQ(vector2.get(0), 0);
-  CHECK_EQ(vector2.get(1), 0);
-  CHECK_EQ(vector2.get(99), 0);
-  CHECK_EQ(vector2.get(100), 1);
+  ASSERT_EQ(vector2.size(), 101);
+  ASSERT_EQ(vector2.get(0), 0);
+  ASSERT_EQ(vector2.get(1), 0);
+  ASSERT_EQ(vector2.get(99), 0);
+  ASSERT_EQ(vector2.get(100), 1);
 }
 
 TEST(Clock, ManyThreads) {
   SyncClock chunked;
   for (int i = 0; i < 100; i++) {
-    ThreadClock vector;
-    vector.tick(i);
+    ThreadClock vector(0);
+    vector.tick();
+    vector.set(i, 1);
     vector.release(&chunked);
-    CHECK_EQ(chunked.size(), i + 1);
+    ASSERT_EQ(i + 1, chunked.size());
     vector.acquire(&chunked);
-    CHECK_EQ(vector.size(), i + 1);
+    ASSERT_EQ(i + 1, vector.size());
   }
-  ThreadClock vector;
+
+  for (int i = 0; i < 100; i++)
+    ASSERT_EQ(1, chunked.get(i));
+
+  ThreadClock vector(1);
   vector.acquire(&chunked);
-  CHECK_EQ(vector.size(), 100);
+  ASSERT_EQ(100, vector.size());
   for (int i = 0; i < 100; i++)
-    CHECK_EQ(vector.get(i), 1);
+    ASSERT_EQ(1, vector.get(i));
 }
 
 TEST(Clock, DifferentSizes) {
   {
-    ThreadClock vector1;
-    vector1.tick(10);
-    ThreadClock vector2;
-    vector2.tick(20);
+    ThreadClock vector1(10);
+    vector1.tick();
+    ThreadClock vector2(20);
+    vector2.tick();
     {
       SyncClock chunked;
       vector1.release(&chunked);
-      CHECK_EQ(chunked.size(), 11);
+      ASSERT_EQ(chunked.size(), 11);
       vector2.release(&chunked);
-      CHECK_EQ(chunked.size(), 21);
+      ASSERT_EQ(chunked.size(), 21);
     }
     {
       SyncClock chunked;
       vector2.release(&chunked);
-      CHECK_EQ(chunked.size(), 21);
+      ASSERT_EQ(chunked.size(), 21);
       vector1.release(&chunked);
-      CHECK_EQ(chunked.size(), 21);
+      ASSERT_EQ(chunked.size(), 21);
     }
     {
       SyncClock chunked;
       vector1.release(&chunked);
       vector2.acquire(&chunked);
-      CHECK_EQ(vector2.size(), 21);
+      ASSERT_EQ(vector2.size(), 21);
     }
     {
       SyncClock chunked;
       vector2.release(&chunked);
       vector1.acquire(&chunked);
-      CHECK_EQ(vector1.size(), 21);
+      ASSERT_EQ(vector1.size(), 21);
+    }
+  }
+}
+
+const int kThreads = 4;
+const int kClocks = 4;
+
+// SimpleSyncClock and SimpleThreadClock implement the same thing as
+// SyncClock and ThreadClock, but in a very simple way.
+struct SimpleSyncClock {
+  u64 clock[kThreads];
+  uptr size;
+
+  SimpleSyncClock() {
+    size = 0;
+    for (uptr i = 0; i < kThreads; i++)
+      clock[i] = 0;
+  }
+
+  bool verify(const SyncClock *other) const {
+    for (uptr i = 0; i < min(size, other->size()); i++) {
+      if (clock[i] != other->get(i))
+        return false;
+    }
+    for (uptr i = min(size, other->size()); i < max(size, other->size()); i++) {
+      if (i < size && clock[i] != 0)
+        return false;
+      if (i < other->size() && other->get(i) != 0)
+        return false;
     }
+    return true;
+  }
+};
+
+struct SimpleThreadClock {
+  u64 clock[kThreads];
+  uptr size;
+  unsigned tid;
+
+  explicit SimpleThreadClock(unsigned tid) {
+    this->tid = tid;
+    size = tid + 1;
+    for (uptr i = 0; i < kThreads; i++)
+      clock[i] = 0;
+  }
+
+  void tick() {
+    clock[tid]++;
+  }
+
+  void acquire(const SimpleSyncClock *src) {
+    if (size < src->size)
+      size = src->size;
+    for (uptr i = 0; i < kThreads; i++)
+      clock[i] = max(clock[i], src->clock[i]);
+  }
+
+  void release(SimpleSyncClock *dst) const {
+    if (dst->size < size)
+      dst->size = size;
+    for (uptr i = 0; i < kThreads; i++)
+      dst->clock[i] = max(dst->clock[i], clock[i]);
+  }
+
+  void acq_rel(SimpleSyncClock *dst) {
+    acquire(dst);
+    release(dst);
+  }
+
+  void ReleaseStore(SimpleSyncClock *dst) const {
+    if (dst->size < size)
+      dst->size = size;
+    for (uptr i = 0; i < kThreads; i++)
+      dst->clock[i] = clock[i];
+  }
+
+  bool verify(const ThreadClock *other) const {
+    for (uptr i = 0; i < min(size, other->size()); i++) {
+      if (clock[i] != other->get(i))
+        return false;
+    }
+    for (uptr i = min(size, other->size()); i < max(size, other->size()); i++) {
+      if (i < size && clock[i] != 0)
+        return false;
+      if (i < other->size() && other->get(i) != 0)
+        return false;
+    }
+    return true;
+  }
+};
+
+static bool ClockFuzzer(bool printing) {
+  // Create kThreads thread clocks.
+  SimpleThreadClock *thr0[kThreads];
+  ThreadClock *thr1[kThreads];
+  for (unsigned i = 0; i < kThreads; i++) {
+    thr0[i] = new SimpleThreadClock(i);
+    thr1[i] = new ThreadClock(i);
+  }
+
+  // Create kClocks sync clocks.
+  SimpleSyncClock *sync0[kClocks];
+  SyncClock *sync1[kClocks];
+  for (unsigned i = 0; i < kClocks; i++) {
+    sync0[i] = new SimpleSyncClock();
+    sync1[i] = new SyncClock();
+  }
+
+  // Do N random operations (acquire, release, etc) and compare results
+  // for SimpleThread/SyncClock and real Thread/SyncClock.
+  for (int i = 0; i < 1000000; i++) {
+    unsigned tid = rand() % kThreads;
+    unsigned cid = rand() % kClocks;
+    thr0[tid]->tick();
+    thr1[tid]->tick();
+
+    switch (rand() % 4) {
+    case 0:
+      if (printing)
+        printf("acquire thr%d <- clk%d\n", tid, cid);
+      thr0[tid]->acquire(sync0[cid]);
+      thr1[tid]->acquire(sync1[cid]);
+      break;
+    case 1:
+      if (printing)
+        printf("release thr%d -> clk%d\n", tid, cid);
+      thr0[tid]->release(sync0[cid]);
+      thr1[tid]->release(sync1[cid]);
+      break;
+    case 2:
+      if (printing)
+        printf("acq_rel thr%d <> clk%d\n", tid, cid);
+      thr0[tid]->acq_rel(sync0[cid]);
+      thr1[tid]->acq_rel(sync1[cid]);
+      break;
+    case 3:
+      if (printing)
+        printf("rel_str thr%d >> clk%d\n", tid, cid);
+      thr0[tid]->ReleaseStore(sync0[cid]);
+      thr1[tid]->ReleaseStore(sync1[cid]);
+      break;
+    }
+
+    if (printing) {
+      for (unsigned i = 0; i < kThreads; i++) {
+        printf("thr%d: ", i);
+        thr1[i]->DebugDump(printf);
+        printf("\n");
+      }
+      for (unsigned i = 0; i < kClocks; i++) {
+        printf("clk%d: ", i);
+        sync1[i]->DebugDump(printf);
+        printf("\n");
+      }
+
+      printf("\n");
+    }
+
+    if (!thr0[tid]->verify(thr1[tid]) || !sync0[cid]->verify(sync1[cid])) {
+      if (!printing)
+        return false;
+      printf("differs with model:\n");
+      for (unsigned i = 0; i < kThreads; i++) {
+        printf("thr%d: clock=[", i);
+        for (uptr j = 0; j < thr0[i]->size; j++)
+          printf("%s%llu", j == 0 ? "" : ",", thr0[i]->clock[j]);
+        printf("]\n");
+      }
+      for (unsigned i = 0; i < kClocks; i++) {
+        printf("clk%d: clock=[", i);
+        for (uptr j = 0; j < sync0[i]->size; j++)
+          printf("%s%llu", j == 0 ? "" : ",", sync0[i]->clock[j]);
+        printf("]\n");
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+TEST(Clock, Fuzzer) {
+  int seed = time(0);
+  printf("seed=%d\n", seed);
+  srand(seed);
+  if (!ClockFuzzer(false)) {
+    // Redo the test with the same seed, but logging operations.
+    srand(seed);
+    ClockFuzzer(true);
+    ASSERT_TRUE(false);
   }
 }