[compiler-rt] 4408eee - tsan: fix false positives in AcquireGlobal

Wed May 27 07:30:03 PDT 2020

Author: Dmitry Vyukov
Date: 2020-05-27T16:27:47+02:00
New Revision: 4408eeed0ff191304121c11168aa1db861cccb97

URL: https://github.com/llvm/llvm-project/commit/4408eeed0ff191304121c11168aa1db861cccb97
DIFF: https://github.com/llvm/llvm-project/commit/4408eeed0ff191304121c11168aa1db861cccb97.diff

LOG: tsan: fix false positives in AcquireGlobal

Add ThreadClock:: global_acquire_ which is the last time another thread
has done a global acquire of this thread's clock.

It helps to avoid problem described in:
https://github.com/golang/go/issues/39186
See test/tsan/java_finalizer2.cpp for a regression test.
Note the failuire is _extremely_ hard to hit, so if you are trying
to reproduce it, you may want to run something like:
$ go get golang.org/x/tools/cmd/stress
$ stress -p=64 ./a.out

The crux of the problem is roughly as follows.
A number of O(1) optimizations in the clocks algorithm assume proper
transitive cumulative propagation of clock values. The AcquireGlobal
operation may produce an inconsistent non-linearazable view of
thread clocks. Namely, it may acquire a later value from a thread
with a higher ID, but fail to acquire an earlier value from a thread
with a lower ID. If a thread that executed AcquireGlobal then releases
to a sync clock, it will spoil the sync clock with the inconsistent
values. If another thread later releases to the sync clock, the optimized
algorithm may break.

The exact sequence of events that leads to the failure.
- thread 1 executes AcquireGlobal
- thread 1 acquires value 1 for thread 2
- thread 2 increments clock to 2
- thread 2 releases to sync object 1
- thread 3 at time 1
- thread 3 acquires from sync object 1
- thread 1 acquires value 1 for thread 3
- thread 1 releases to sync object 2
- sync object 2 clock has 1 for thread 2 and 1 for thread 3
- thread 3 releases to sync object 2
- thread 3 sees value 1 in the clock for itself
  and decides that it has already released to the clock
  and did not acquire anything from other threads after that
  (the last_acquire_ check in release operation)
- thread 3 does not update the value for thread 2 in the clock from 1 to 2
- thread 4 acquires from sync object 2
- thread 4 detects a false race with thread 2
  as it should have been synchronized with thread 2 up to time 2,
  but because of the broken clock it is now synchronized only up to time 1

The global_acquire_ value helps to prevent this scenario.
Namely, thread 3 will not trust any own clock values up to global_acquire_
for the purposes of the last_acquire_ optimization.

Reviewed-in: https://reviews.llvm.org/D80474
Reported-by: nvanbenschoten (Nathan VanBenschoten)

Added: 
    compiler-rt/test/tsan/java_finalizer2.cpp

Modified: 
    compiler-rt/lib/tsan/rtl/tsan_clock.cpp
    compiler-rt/lib/tsan/rtl/tsan_clock.h
    compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/tsan/rtl/tsan_clock.cpp b/compiler-rt/lib/tsan/rtl/tsan_clock.cpp
index acbcf804194a..c91b29cb22b4 100644

--- a/compiler-rt/lib/tsan/rtl/tsan_clock.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_clock.cpp
@@ -115,13 +115,14 @@ static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
 ThreadClock::ThreadClock(unsigned tid, unsigned reused)
     : tid_(tid)
     , reused_(reused + 1)  // 0 has special meaning
+    , last_acquire_()
+    , global_acquire_()
     , cached_idx_()
     , cached_size_()
     , cached_blocks_() {
   CHECK_LT(tid, kMaxTidInClock);
   CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
   nclk_ = tid_ + 1;
-  last_acquire_ = 0;
   internal_memset(clk_, 0, sizeof(clk_));
 }
 
@@ -247,7 +248,7 @@ void ThreadClock::release(ClockCache *c, SyncClock *dst) {
   // Check if we had not acquired anything from other threads
   // since the last release on dst. If so, we need to update
   // only dst->elem(tid_).
-  if (dst->elem(tid_).epoch > last_acquire_) {
+  if (!HasAcquiredAfterRelease(dst)) {
     UpdateCurrentThread(c, dst);
     if (dst->release_store_tid_ != tid_ ||
         dst->release_store_reused_ != reused_)
@@ -318,7 +319,7 @@ void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
 
   if (dst->release_store_tid_ == tid_ &&
       dst->release_store_reused_ == reused_ &&
-      dst->elem(tid_).epoch > last_acquire_) {
+      !HasAcquiredAfterRelease(dst)) {
     CPP_STAT_INC(StatClockStoreFast);
     UpdateCurrentThread(c, dst);
     return;
@@ -400,6 +401,14 @@ bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
   return true;
 }
 
+// Checks whether the current thread has acquired anything
+// from other clocks after releasing to dst (directly or indirectly).
+bool ThreadClock::HasAcquiredAfterRelease(const SyncClock *dst) const {
+  const u64 my_epoch = dst->elem(tid_).epoch;
+  return my_epoch <= last_acquire_ ||
+      my_epoch <= atomic_load_relaxed(&global_acquire_);
+}
+
 // Sets a single element in the vector clock.
 // This function is called only from weird places like AcquireGlobal.
 void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {

diff  --git a/compiler-rt/lib/tsan/rtl/tsan_clock.h b/compiler-rt/lib/tsan/rtl/tsan_clock.h
index c66431f54ee4..736cdae06ba2 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_clock.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_clock.h
@@ -139,6 +139,7 @@ class ThreadClock {
   void acq_rel(ClockCache *c, SyncClock *dst);
   void ReleaseStore(ClockCache *c, SyncClock *dst);
   void ResetCached(ClockCache *c);
+  void NoteGlobalAcquire(u64 v);
 
   void DebugReset();
   void DebugDump(int(*printf)(const char *s, ...));
@@ -151,6 +152,53 @@ class ThreadClock {
   // Current thread time when it acquired something from other threads.
   u64 last_acquire_;
 
+  // Last time another thread has done a global acquire of this thread's clock.
+  // It helps to avoid problem described in:
+  // https://github.com/golang/go/issues/39186
+  // See test/tsan/java_finalizer2.cpp for a regression test.
+  // Note the failuire is _extremely_ hard to hit, so if you are trying
+  // to reproduce it, you may want to run something like:
+  // $ go get golang.org/x/tools/cmd/stress
+  // $ stress -p=64 ./a.out
+  //
+  // The crux of the problem is roughly as follows.
+  // A number of O(1) optimizations in the clocks algorithm assume proper
+  // transitive cumulative propagation of clock values. The AcquireGlobal
+  // operation may produce an inconsistent non-linearazable view of
+  // thread clocks. Namely, it may acquire a later value from a thread
+  // with a higher ID, but fail to acquire an earlier value from a thread
+  // with a lower ID. If a thread that executed AcquireGlobal then releases
+  // to a sync clock, it will spoil the sync clock with the inconsistent
+  // values. If another thread later releases to the sync clock, the optimized
+  // algorithm may break.
+  //
+  // The exact sequence of events that leads to the failure.
+  // - thread 1 executes AcquireGlobal
+  // - thread 1 acquires value 1 for thread 2
+  // - thread 2 increments clock to 2
+  // - thread 2 releases to sync object 1
+  // - thread 3 at time 1
+  // - thread 3 acquires from sync object 1
+  // - thread 3 increments clock to 2
+  // - thread 1 acquires value 2 for thread 3
+  // - thread 1 releases to sync object 2
+  // - sync object 2 clock has 1 for thread 2 and 2 for thread 3
+  // - thread 3 releases to sync object 2
+  // - thread 3 sees value 2 in the clock for itself
+  //   and decides that it has already released to the clock
+  //   and did not acquire anything from other threads after that
+  //   (the last_acquire_ check in release operation)
+  // - thread 3 does not update the value for thread 2 in the clock from 1 to 2
+  // - thread 4 acquires from sync object 2
+  // - thread 4 detects a false race with thread 2
+  //   as it should have been synchronized with thread 2 up to time 2,
+  //   but because of the broken clock it is now synchronized only up to time 1
+  //
+  // The global_acquire_ value helps to prevent this scenario.
+  // Namely, thread 3 will not trust any own clock values up to global_acquire_
+  // for the purposes of the last_acquire_ optimization.
+  atomic_uint64_t global_acquire_;
+
   // Cached SyncClock (without dirty entries and release_store_tid_).
   // We reuse it for subsequent store-release operations without intervening
   // acquire operations. Since it is shared (and thus constant), clock value
@@ -165,6 +213,7 @@ class ThreadClock {
   u64 clk_[kMaxTidInClock];  // Fixed size vector clock.
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
+  bool HasAcquiredAfterRelease(const SyncClock *dst) const;
   void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
 };
 
@@ -186,6 +235,14 @@ ALWAYS_INLINE uptr ThreadClock::size() const {
   return nclk_;
 }
 
+ALWAYS_INLINE void ThreadClock::NoteGlobalAcquire(u64 v) {
+  // Here we rely on the fact that AcquireGlobal is protected by
+  // ThreadRegistryLock, thus only one thread at a time executes it
+  // and values passed to this function should not go backwards.
+  CHECK_LE(atomic_load_relaxed(&global_acquire_), v);
+  atomic_store_relaxed(&global_acquire_, v);
+}
+
 ALWAYS_INLINE SyncClock::Iter SyncClock::begin() {
   return Iter(this);
 }

diff  --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
index bca194f064b4..ebd0d7221818 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
@@ -415,8 +415,10 @@ static void UpdateClockCallback(ThreadContextBase *tctx_base, void *arg) {
   ThreadState *thr = reinterpret_cast<ThreadState*>(arg);
   ThreadContext *tctx = static_cast<ThreadContext*>(tctx_base);
   u64 epoch = tctx->epoch1;
-  if (tctx->status == ThreadStatusRunning)
+  if (tctx->status == ThreadStatusRunning) {
     epoch = tctx->thr->fast_state.epoch();
+    tctx->thr->clock.NoteGlobalAcquire(epoch);
+  }
   thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch);
 }
 

diff  --git a/compiler-rt/test/tsan/java_finalizer2.cpp b/compiler-rt/test/tsan/java_finalizer2.cpp
new file mode 100644
index 000000000000..f2590f7c40b9
--- /dev/null
+++ b/compiler-rt/test/tsan/java_finalizer2.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+// Regression test for https://github.com/golang/go/issues/39186
+#include "java.h"
+#include <string.h>
+
+struct Heap {
+  uint64_t data;
+  uint64_t ready;
+  uint64_t finalized;
+  uint64_t wg;
+  pthread_barrier_t barrier_finalizer;
+  pthread_barrier_t barrier_ballast;
+};
+
+void *Thread1(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  __tsan_java_finalize();
+  __atomic_fetch_add(&heap->wg, 1, __ATOMIC_RELEASE);
+  __atomic_store_n(&heap->finalized, 1, __ATOMIC_RELAXED);
+  return 0;
+}
+
+void *Thread2(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  heap->data = 1;
+  __atomic_store_n(&heap->ready, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void *Thread3(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  while (__atomic_load_n(&heap->ready, __ATOMIC_ACQUIRE) != 1)
+    pthread_yield();
+  while (__atomic_load_n(&heap->finalized, __ATOMIC_RELAXED) != 1)
+    pthread_yield();
+  __atomic_fetch_add(&heap->wg, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void *Ballast(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_ballast);
+  return 0;
+}
+
+int main() {
+  Heap* heap = (Heap*)calloc(sizeof(Heap), 1);
+  __tsan_java_init((jptr)heap, sizeof(*heap));
+  __tsan_java_alloc((jptr)heap, sizeof(*heap));
+  // Ballast threads merely make the bug a bit easier to trigger.
+  const int kBallastThreads = 100;
+  pthread_barrier_init(&heap->barrier_finalizer, 0, 4);
+  pthread_barrier_init(&heap->barrier_ballast, 0, kBallastThreads + 1);
+  pthread_t th[3];
+  pthread_create(&th[0], 0, Thread1, heap);
+  pthread_create(&th[1], 0, Thread2, heap);
+  pthread_t ballast[kBallastThreads];
+  for (int i = 0; i < kBallastThreads; i++)
+    pthread_create(&ballast[i], 0, Ballast, heap);
+  pthread_create(&th[2], 0, Thread3, heap);
+  pthread_barrier_wait(&heap->barrier_ballast);
+  for (int i = 0; i < kBallastThreads; i++)
+    pthread_join(ballast[i], 0);
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  while (__atomic_load_n(&heap->wg, __ATOMIC_ACQUIRE) != 2)
+    pthread_yield();
+  if (heap->data != 1)
+    exit(printf("no data\n"));
+  for (int i = 0; i < 3; i++)
+    pthread_join(th[i], 0);
+  pthread_barrier_destroy(&heap->barrier_ballast);
+  pthread_barrier_destroy(&heap->barrier_finalizer);
+  __tsan_java_free((jptr)heap, sizeof(*heap));
+  fprintf(stderr, "DONE\n");
+  return __tsan_java_fini();
+}
+
+// CHECK-NOT: WARNING: ThreadSanitizer: data race
+// CHECK: DONE