[compiler-rt] TSan: Support relaxed accesses and fences (PR #142579)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 3 03:48:43 PDT 2025


https://github.com/rymrg created https://github.com/llvm/llvm-project/pull/142579

This PR adds support for relaxed atomic accesses and fences. Since correct instrumentation increases runtime overhead, the feature is placed behind a flag and is off by default.
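A hypothetical invocation, assuming a program already built with -fsanitize=thread (flags declared in tsan_flags.inc are parsed from the TSAN_OPTIONS environment variable):

  % clang++ -fsanitize=thread -g -O1 fence_mp.cpp
  % TSAN_OPTIONS=correct_race_detection=1 ./a.out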

The PR is based on the paper "Dynamic Robustness Verification against Weak Memory" (DOI: https://doi.org/10.1145/3729277).
The race detection extension is described in Appendix B of the arXiv version: https://doi.org/10.48550/arXiv.2504.15036


Related issue: https://github.com/google/sanitizers/issues/1415
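For illustration, a minimal fence-based message-passing program of the kind this mode targets (a sketch, not part of the patch): it is race-free under the C++ memory model because the release/acquire fences order the plain accesses to data, but with fences left unmodeled (the FIXME removed below) TSan reports a false race on data; with correct_race_detection=1 the fences are taken into account and the report should go away.

  #include <atomic>
  #include <thread>

  int data;                  // plain, non-atomic payload
  std::atomic<int> flag{0};  // synchronization variable, accessed only with relaxed operations

  void producer() {
    data = 42;                                            // plain write
    std::atomic_thread_fence(std::memory_order_release);  // release fence
    flag.store(1, std::memory_order_relaxed);             // relaxed store publishes the payload
  }

  void consumer() {
    while (flag.load(std::memory_order_relaxed) == 0) {   // spin on relaxed load
    }
    std::atomic_thread_fence(std::memory_order_acquire);  // acquire fence pairs with the release fence
    int v = data;                                         // plain read, ordered by the fences
    (void)v;
  }

  int main() {
    std::thread t1(producer), t2(consumer);
    t1.join();
    t2.join();
  }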

From 30e25502d6557f75e81ba37c780975b325e2dfde Mon Sep 17 00:00:00 2001
From: rymrg <54061433+rymrg at users.noreply.github.com>
Date: Tue, 3 Jun 2025 13:33:18 +0300
Subject: [PATCH] TSan: Support relaxed accesses and fences

---
 compiler-rt/lib/tsan/rtl/tsan_flags.inc       |   3 +
 .../lib/tsan/rtl/tsan_interface_atomic.cpp    | 104 ++++++++++++++----
 compiler-rt/lib/tsan/rtl/tsan_rtl.h           |   2 +
 3 files changed, 86 insertions(+), 23 deletions(-)

diff --git a/compiler-rt/lib/tsan/rtl/tsan_flags.inc b/compiler-rt/lib/tsan/rtl/tsan_flags.inc
index 731d776cc893e..a4f240ee8612c 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_flags.inc
+++ b/compiler-rt/lib/tsan/rtl/tsan_flags.inc
@@ -80,3 +80,6 @@ TSAN_FLAG(bool, shared_ptr_interceptor, true,
 TSAN_FLAG(bool, print_full_thread_history, false,
           "If set, prints thread creation stacks for the threads involved in "
           "the report and their ancestors up to the main thread.")
+TSAN_FLAG(bool, correct_race_detection, false,
+          "If set, disables fast-path optimizations and performs correct race "
+          "detection with support for fences and release sequences.")
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp
index 527e5a9b4a8d8..96bcd3815304a 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interface_atomic.cpp
@@ -27,6 +27,10 @@
 
 using namespace __tsan;
 
+static bool correctRaceDetection() {
+  return flags()->correct_race_detection;
+}
+
 #if !SANITIZER_GO && __TSAN_HAS_INT128
 // Protects emulation of 128-bit atomic operations.
 static StaticSpinMutex mutex128;
@@ -227,18 +231,36 @@ namespace {
 template <typename T, T (*F)(volatile T *v, T op)>
 static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) {
   MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(), kAccessWrite | kAccessAtomic);
-  if (LIKELY(mo == mo_relaxed))
-    return F(a, v);
+  if (!correctRaceDetection()) {
+    if (LIKELY(mo == mo_relaxed))
+      return F(a, v);
+  }
   SlotLocker locker(thr);
   {
     auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
-    RWLock lock(&s->mtx, IsReleaseOrder(mo));
-    if (IsAcqRelOrder(mo))
-      thr->clock.ReleaseAcquire(&s->clock);
-    else if (IsReleaseOrder(mo))
-      thr->clock.Release(&s->clock);
-    else if (IsAcquireOrder(mo))
-      thr->clock.Acquire(s->clock);
+    bool fullLock = correctRaceDetection() || IsReleaseOrder(mo);
+    RWLock lock(&s->mtx, fullLock);
+    if (!correctRaceDetection()) {
+      if (IsAcqRelOrder(mo))
+        thr->clock.ReleaseAcquire(&s->clock);
+      else if (IsReleaseOrder(mo))
+        thr->clock.Release(&s->clock);
+      else if (IsAcquireOrder(mo))
+        thr->clock.Acquire(s->clock);
+    } else {
+      if (mo == mo_relaxed) {
+        thr->clockA.Acquire(s->clock);
+        thr->clockR.Release(&s->clock);
+      } else if (IsAcqRelOrder(mo)) {
+        thr->clock.ReleaseAcquire(&s->clock);
+      } else if (IsReleaseOrder(mo)) {
+        thr->clockA.Acquire(s->clock);
+        thr->clock.Release(&s->clock);
+      } else if (IsAcquireOrder(mo)) {
+        thr->clock.Acquire(s->clock);
+        thr->clockR.Release(&s->clock);
+      }
+    }
     v = F(a, v);
   }
   if (IsReleaseOrder(mo))
@@ -264,7 +286,7 @@ struct OpLoad {
     DCHECK(IsLoadOrder(mo));
     // This fast-path is critical for performance.
     // Assume the access is atomic.
-    if (!IsAcquireOrder(mo)) {
+    if (!correctRaceDetection() && !IsAcquireOrder(mo)) {
       MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(),
                    kAccessRead | kAccessAtomic);
       return NoTsanAtomic(mo, a);
@@ -276,7 +298,11 @@ struct OpLoad {
     if (s) {
       SlotLocker locker(thr);
       ReadLock lock(&s->mtx);
-      thr->clock.Acquire(s->clock);
+      if (IsAcquireOrder(mo)) {
+        thr->clock.Acquire(s->clock);
+      } else if (correctRaceDetection()) {
+        thr->clockA.Acquire(s->clock);
+      }
       // Re-read under sync mutex because we need a consistent snapshot
       // of the value and the clock we acquire.
       v = NoTsanAtomic(mo, a);
@@ -309,7 +335,7 @@ struct OpStore {
     // Assume the access is atomic.
     // Strictly saying even relaxed store cuts off release sequence,
     // so must reset the clock.
-    if (!IsReleaseOrder(mo)) {
+    if (!correctRaceDetection() && !IsReleaseOrder(mo)) {
       NoTsanAtomic(mo, a, v);
       return;
     }
@@ -317,10 +343,14 @@ struct OpStore {
     {
       auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
       Lock lock(&s->mtx);
-      thr->clock.ReleaseStore(&s->clock);
+      if (IsReleaseOrder(mo))
+        thr->clock.ReleaseStore(&s->clock);
+      else if (correctRaceDetection())
+        thr->clockR.ReleaseStore(&s->clock);
       NoTsanAtomic(mo, a, v);
     }
-    IncrementEpoch(thr);
+    if (IsReleaseOrder(mo))
+      IncrementEpoch(thr);
   }
 };
 
@@ -441,7 +471,7 @@ struct OpCAS {
 
     MemoryAccess(thr, pc, (uptr)a, AccessSize<T>(),
                  kAccessWrite | kAccessAtomic);
-    if (LIKELY(mo == mo_relaxed && fmo == mo_relaxed)) {
+    if (LIKELY(!correctRaceDetection() && mo == mo_relaxed && fmo == mo_relaxed)) {
       T cc = *c;
       T pr = func_cas(a, cc, v);
       if (pr == cc)
@@ -454,7 +484,8 @@ struct OpCAS {
     bool success;
     {
       auto s = ctx->metamap.GetSyncOrCreate(thr, pc, (uptr)a, false);
-      RWLock lock(&s->mtx, release);
+      bool fullLock = correctRaceDetection() || release;
+      RWLock lock(&s->mtx, fullLock);
       T cc = *c;
       T pr = func_cas(a, cc, v);
       success = pr == cc;
@@ -462,12 +493,27 @@ struct OpCAS {
         *c = pr;
         mo = fmo;
       }
-      if (success && IsAcqRelOrder(mo))
-        thr->clock.ReleaseAcquire(&s->clock);
-      else if (success && IsReleaseOrder(mo))
-        thr->clock.Release(&s->clock);
-      else if (IsAcquireOrder(mo))
-        thr->clock.Acquire(s->clock);
+      if (!correctRaceDetection()) {
+        if (success && IsAcqRelOrder(mo))
+          thr->clock.ReleaseAcquire(&s->clock);
+        else if (success && IsReleaseOrder(mo))
+          thr->clock.Release(&s->clock);
+        else if (IsAcquireOrder(mo))
+          thr->clock.Acquire(s->clock);
+      } else {
+        if (!IsAcquireOrder(mo)) {
+          thr->clockA.Acquire(s->clock);
+        } else {
+          thr->clock.Acquire(s->clock);
+        }
+        if (success) {
+          if (!IsReleaseOrder(mo)) {
+            thr->clockR.Release(&s->clock);
+          } else {
+            thr->clock.Release(&s->clock);
+          }
+        }
+      }
     }
     if (success && release)
       IncrementEpoch(thr);
@@ -487,7 +533,19 @@ struct OpFence {
   static void NoTsanAtomic(morder mo) { __sync_synchronize(); }
 
   static void Atomic(ThreadState *thr, uptr pc, morder mo) {
-    // FIXME(dvyukov): not implemented.
+    if (correctRaceDetection()) {
+      SlotLocker locker(thr);
+      if (IsAcquireOrder(mo))
+        thr->clock.Acquire(&thr->clockA);
+      if (mo == mo_seq_cst) {
+        auto s = ctx->metamap.GetSyncOrCreate(thr, pc, 0, false);
+        thr->clock.ReleaseAcquire(&s->clock);
+      }
+      if (IsReleaseOrder(mo)) {
+        thr->clockR.Acquire(&thr->clock);
+        IncrementEpoch(thr);
+      }
+    }
     __sync_synchronize();
   }
 };
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
index 4dc5e630c5249..6cd40c9eff07c 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h
@@ -177,6 +177,8 @@ struct alignas(SANITIZER_CACHE_LINE_SIZE) ThreadState {
   atomic_sint32_t pending_signals;
 
   VectorClock clock;
+  VectorClock clockR;
+  VectorClock clockA;
 
   // This is a slow path flag. On fast path, fast_state.GetIgnoreBit() is read.
   // We do not distinguish beteween ignoring reads and writes


