[compiler-rt] [llvm] [tsan] Introduce Adaptive Delay Scheduling to TSAN (PR #178836)
Chris Cotter via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 12 20:14:46 PST 2026
================
@@ -0,0 +1,467 @@
+//===-- tsan_fuzzing_scheduler.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "tsan_fuzzing_scheduler.h"
+
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_errno_codes.h"
+#include "tsan_interface.h"
+#include "tsan_rtl.h"
+
+extern "C" int pthread_detach(void*);
+
+namespace __interception {
+extern int (*real_pthread_detach)(void*);
+} // namespace __interception
+
+namespace __tsan {
+
+namespace {
+
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+#endif
+struct NullFuzzingScheduler : IFuzzingScheduler {
+#ifdef __clang__
+# pragma clang diagnostic pop
+#endif
+ void Init() override {}
+ void AtomicOpFence(int mo) override {}
+ void AtomicOpAddr(uptr addr, int mo) override {}
+ void MutexCvOp() override {}
+ int DetachThread(void* th) override { return REAL(pthread_detach)(th); }
+ void BeforeChildThreadRuns() override {}
+ void AfterThreadCreation() override {}
+ void JoinOp() override {}
+};
+
+// =============================================================================
+// DelaySpec: Represents a delay configuration parsed from flag strings
+// =============================================================================
+//
+// Delay can be specified as:
+// - "spin=N" : Spin for up to N cycles (very short delays)
+// - "yield" : Call sched_yield() once
+// - "sleep_us=N" : Sleep for up to N microseconds
+
+enum class DelayType { Spin, Yield, SleepUs };
+
+struct DelaySpec {
+ DelayType type;
+ int value; // spin cycles or sleep_us value; ignored for yield
+
+ // Estimated nanoseconds per spin cycle (volatile loop iteration)
+ static constexpr u64 kNsPerSpinCycle = 5;
+ // Estimated nanoseconds for a yield (context switch overhead)
+ static constexpr u64 kNsPerYield = 500;
+
+ static DelaySpec Parse(const char* str) {
+ DelaySpec spec;
+ if (internal_strncmp(str, "spin=", 5) == 0) {
+ spec.type = DelayType::Spin;
+ spec.value = internal_atoll(str + 5);
+ if (spec.value <= 0)
+ spec.value = 10;
+ } else if (internal_strcmp(str, "yield") == 0) {
+ spec.type = DelayType::Yield;
+ spec.value = 0;
+ } else if (internal_strncmp(str, "sleep_us=", 9) == 0) {
+ spec.type = DelayType::SleepUs;
+ spec.value = internal_atoll(str + 9);
+ if (spec.value <= 0)
+ spec.value = 1;
+ } else {
+ // Default to yield if unrecognized
+ Printf("WARNING: Unrecognized delay spec '%s', defaulting to yield\n",
+ str);
+ spec.type = DelayType::Yield;
+ spec.value = 0;
+ }
+ return spec;
+ }
+
+ u64 EstimatedNs() const {
+ switch (type) {
+ case DelayType::Spin:
+ return value * kNsPerSpinCycle;
+ case DelayType::Yield:
+ return kNsPerYield;
+ case DelayType::SleepUs:
+ return value * 1000ULL;
+ }
+ return 0;
+ }
+
+ const char* TypeName() const {
+ switch (type) {
+ case DelayType::Spin:
+ return "spin";
+ case DelayType::Yield:
+ return "yield";
+ case DelayType::SleepUs:
+ return "sleep_us";
+ }
+ return "unknown";
+ }
+};
+
+// =============================================================================
+// AdaptiveDelayScheduler: Time-budget aware delay injection for race exposure
+// =============================================================================
+//
+// This scheduler injects delays to expose data races while maintaining a
+// configurable overhead target. It uses several strategies:
+//
+// 1. Time-Budget Controller: Tracks cumulative delays vs wall-clock time
+// and adjusts delay probability to maintain target overhead.
+//
+// 2. Tiered Delays: Different delay strategies for different op types:
+// - Relaxed atomics: Very rare sampling, tiny spin delays
+// - Sync atomics (acq/rel/seq_cst): Moderate sampling, small usleep
+// - Mutex/CV ops: Higher sampling, larger delays
+// - Thread create/join: Always delay (rare but high value)
+//
+// 3. Address-based Sampling: Exponential backoff per address to avoid
+// repeatedly delaying hot atomics.
+
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+#endif
+struct AdaptiveDelayScheduler : NullFuzzingScheduler {
+#ifdef __clang__
+# pragma clang diagnostic pop
+#endif
+
+ ALWAYS_INLINE static FuzzingSchedulerTlsData* TLS() {
+ return &cur_thread()->fuzzingSchedulerTlsData;
+ }
+ ALWAYS_INLINE static unsigned int* GetRandomSeed() {
+ return &cur_thread()->fuzzingSchedulerTlsData.tls_random_seed_;
+ }
+ ALWAYS_INLINE static void SetRandomSeed(unsigned int seed) {
+ cur_thread()->fuzzingSchedulerTlsData.tls_random_seed_ = seed;
+ }
+
+ // The public facing option is adaptive_delay_aggressiveness, which is an
+ // opaque value for the user to tune the amount of delay injected into the
+ // program. Internally, the implementation maps the aggressiveness to a target
+ // percent delay for the overall program runtime. It's not easy to implement
+ // a true wall clock delay target (e.g., 25% program wall time slowdown)
+ // because 1) spin loops and yield are hard to calculate actual wall time
+ // slowness and 2) usleep(N) is often slower than advertised. Thus, we keep
+ // the user facing parameter opaque to not under deliver on a promise of
+ // percent wall time slowdown.
+ struct TimeBudget {
+ int target_overhead_pct_;
+ Percent target_low_;
+ Percent target_high_;
+
+ void Init(int target_pct) {
+ target_overhead_pct_ = target_pct;
+ target_low_ = Percent::FromPct(
+ target_overhead_pct_ >= 5 ? target_overhead_pct_ - 5 : 0);
+ target_high_ = Percent::FromPct(target_overhead_pct_ + 5);
+ }
+
+ static constexpr u64 BucketDurationNs = 30'000'000'000ULL;
+
+ void RecordDelay(u64 delay_ns) {
+ u64 now = NanoTime();
+ u64 elapsed_ns = now - TLS()->bucket_start_ns_;
+
+ if (elapsed_ns >= BucketDurationNs) {
+ // Shift: old bucket is discarded, new becomes old, start fresh new
+ TLS()->delay_buckets_ns_[0] = TLS()->delay_buckets_ns_[1];
+ TLS()->delay_buckets_ns_[1] = 0;
+ TLS()->bucket_start_ns_ = now;
+ }
+
+ TLS()->delay_buckets_ns_[1] += delay_ns;
+ }
+
+ Percent GetOverheadPercent() {
+ u64 now = NanoTime();
+ u64 elapsed_ns = now - TLS()->bucket_start_ns_;
+
+ // Need at least 1ms to calculate
+ if (elapsed_ns < 1'000'000ULL)
+ return Percent::FromPct(0);
+
+ if (elapsed_ns > BucketDurationNs * 2) {
+ // Both buckets are stale
+ return Percent::FromPct(0);
+ } else if (elapsed_ns > BucketDurationNs) {
+ // bucket[0] is stale, use only bucket[1] (current bucket)
+ u64 total_delay_ns = TLS()->delay_buckets_ns_[1];
+ return Percent::FromRatio(total_delay_ns, elapsed_ns);
+ } else {
+ u64 total_delay_ns =
+ TLS()->delay_buckets_ns_[0] + TLS()->delay_buckets_ns_[1];
+ u64 window_ns = BucketDurationNs + elapsed_ns;
+ return Percent::FromRatio(total_delay_ns, window_ns);
+ }
+ }
+
+ bool ShouldDelay() {
+ Percent ratio = GetOverheadPercent();
+
+ if (ratio < target_low_)
+ return true;
+ if (ratio > target_high_)
+ return false;
+
+ // Linear interpolation: at target_low -> 100%, at target_high -> 0%
+ Percent prob = (target_high_ - ratio) / (target_high_ - target_low_);
+ return prob.RandomCheck(GetRandomSeed());
+ }
+ };
+
+ // Address Sampler with Exponential Backoff
+ struct AddressSampler {
+ static constexpr u64 TABLE_SIZE = 2048;
+ struct Entry {
+ atomic_uintptr_t addr_;
+ atomic_uint32_t count_;
+ };
+ Entry table_[TABLE_SIZE];
+ static constexpr u32 ExponentialBackoffCap = 64;
+
+ void Init() {
+ for (u64 i = 0; i < TABLE_SIZE; ++i) {
+ atomic_store(&table_[i].addr_, 0, memory_order_relaxed);
+ atomic_store(&table_[i].count_, 0, memory_order_relaxed);
+ }
+ }
+
+ static ALWAYS_INLINE u64 splitmix64(u64 x) {
+ x = (x ^ (x >> 30)) * 0xBF58476D1CE4E5B9ULL;
+ x = (x ^ (x >> 27)) * 0x94D049BB133111EBULL;
+ x = x ^ (x >> 31);
+ return x;
+ }
+
+ // Uses exponential backoff: delay on 1st, 2nd, 4th, 8th, 16th, ...
+ bool ShouldDelayAddr(uptr addr) {
+ u64 idx = splitmix64(addr >> 3) & (TABLE_SIZE - 1);
+ Entry& e = table_[idx];
+
+ // This function is not thread safe.
+ // If two threads access the same hashed entry in parallel,
+ // worst case, we may end up returning true too often. This is
+ // acceptable...instead of full locking.
+
+ uptr stored_addr = atomic_load(&e.addr_, memory_order_relaxed);
+ if (stored_addr != addr) {
+ // Hash Collision - reset
+ atomic_store(&e.addr_, addr, memory_order_relaxed);
+ atomic_store(&e.count_, 1, memory_order_relaxed);
+ return true;
+ }
+
+ u32 count = atomic_fetch_add(&e.count_, 1, memory_order_relaxed) + 1;
+
+ if ((count & (count - 1)) == 0 && count <= ExponentialBackoffCap)
+ return true;
+ return false;
+ }
+ };
+
+ TimeBudget budget_;
+ AddressSampler sampler_;
+
+ int relaxed_sample_rate_;
+ int sync_atomic_sample_rate_;
+ int mutex_sample_rate_;
+ DelaySpec atomic_delay_;
+ DelaySpec sync_delay_;
+
+ void Init() override { InitTls(); }
+
+ void InitTls() {
+ TLS()->bucket_start_ns_ = NanoTime();
+ TLS()->delay_buckets_ns_[0] = 0;
+ TLS()->delay_buckets_ns_[1] = 0;
+
+ SetRandomSeed(flags()->adaptive_delay_random_seed);
+ if (*GetRandomSeed() == 0)
+ SetRandomSeed(NanoTime());
+ TLS()->tls_initialized_ = true;
+ }
+
+ bool IsTlsInitialized() const { return TLS()->tls_initialized_; }
+
+ AdaptiveDelayScheduler() {
+ relaxed_sample_rate_ = flags()->adaptive_delay_relaxed_sample_rate;
+ sync_atomic_sample_rate_ = flags()->adaptive_delay_sync_atomic_sample_rate;
+ mutex_sample_rate_ = flags()->adaptive_delay_mutex_sample_rate;
+ atomic_delay_ = DelaySpec::Parse(flags()->adaptive_delay_max_atomic);
+ sync_delay_ = DelaySpec::Parse(flags()->adaptive_delay_max_sync);
+
+ int delay_aggressiveness = flags()->adaptive_delay_aggressiveness;
+ if (delay_aggressiveness < 1)
+ delay_aggressiveness = 1;
+
+ budget_.Init(delay_aggressiveness);
+ sampler_.Init();
+
+ Printf("INFO: ThreadSanitizer AdaptiveDelayScheduler initialized\n");
+ Printf(" Delay aggressiveness: %d\n", delay_aggressiveness);
+ Printf(" Random seed: %u\n", *GetRandomSeed());
+ Printf(" Relaxed atomic sample rate: 1/%d\n", relaxed_sample_rate_);
+ Printf(" Sync atomic sample rate: 1/%d\n", sync_atomic_sample_rate_);
+ Printf(" Mutex sample rate: 1/%d\n", mutex_sample_rate_);
+ Printf(" Atomic delay: %s=%d (~%llu ns)\n", atomic_delay_.TypeName(),
+ atomic_delay_.value, atomic_delay_.EstimatedNs());
+ Printf(" Sync delay: %s=%d (~%llu ns)\n", sync_delay_.TypeName(),
+ sync_delay_.value, sync_delay_.EstimatedNs());
+ }
+
+ void DoSpinDelay(int cycles) {
+ volatile int v = 0;
+ for (int i = 0; i < cycles; ++i) v = i;
+ (void)v;
+ }
+
+ void DoYieldDelay() { internal_sched_yield(); }
+
+ void DoSleepUsDelay(int max_us) {
+ int delay_us = 1 + (Rand(GetRandomSeed()) % max_us);
+ internal_usleep(delay_us);
+ budget_.RecordDelay(delay_us * 1000ULL);
+ }
+
+ void ExecuteDelay(const DelaySpec& spec) {
+ switch (spec.type) {
+ case DelayType::Spin: {
+ int cycles = 1 + (Rand(GetRandomSeed()) % spec.value);
+ DoSpinDelay(cycles);
+ budget_.RecordDelay(cycles * DelaySpec::kNsPerSpinCycle);
+ break;
+ }
+ case DelayType::Yield:
+ DoYieldDelay();
+ budget_.RecordDelay(DelaySpec::kNsPerYield);
+ break;
+ case DelayType::SleepUs:
+ DoSleepUsDelay(spec.value);
+ break;
+ }
+ }
+
+ void AtomicRelaxedOpDelay() {
+ if ((Rand(GetRandomSeed()) % relaxed_sample_rate_) != 0)
+ return;
+ if (!budget_.ShouldDelay())
+ return;
+
+ DoSpinDelay(10 + (Rand(GetRandomSeed()) % 10));
+ static constexpr int spin_delay_estimate_ns = 50;
+ budget_.RecordDelay(spin_delay_estimate_ns);
+ }
+
+ void AtomicSyncOpDelay(uptr* addr) {
+ if ((Rand(GetRandomSeed()) % sync_atomic_sample_rate_) != 0)
+ return;
+ if (!budget_.ShouldDelay())
+ return;
+
+ if (addr && !sampler_.ShouldDelayAddr(*addr))
+ return;
+
+ ExecuteDelay(atomic_delay_);
+ }
+
+ void AtomicOpFence(int mo) override {
+ CHECK(IsTlsInitialized());
+
+ if (mo < mo_acquire)
+ AtomicRelaxedOpDelay();
+ else
+ AtomicSyncOpDelay(nullptr);
+ }
+
+ void AtomicOpAddr(uptr addr, int mo) override {
+ CHECK(IsTlsInitialized());
+
+ if (mo < mo_acquire)
+ AtomicRelaxedOpDelay();
+ else
+ AtomicSyncOpDelay(&addr);
+ }
+
+ void UnsampledDelay() {
+ CHECK(IsTlsInitialized());
+
+ if (!budget_.ShouldDelay())
+ return;
+
+ ExecuteDelay(sync_delay_);
+ }
+
+ void MutexCvOp() override {
+ CHECK(IsTlsInitialized());
+
+ if ((Rand(GetRandomSeed()) % mutex_sample_rate_) != 0)
+ return;
+ if (!budget_.ShouldDelay())
+ return;
+
+ ExecuteDelay(sync_delay_);
+ }
+
+ void JoinOp() override { UnsampledDelay(); }
+
+ void BeforeChildThreadRuns() override {
+ InitTls();
+ UnsampledDelay();
+ }
+
+ void AfterThreadCreation() override { UnsampledDelay(); }
+
+ int DetachThread(void* th) override {
+ int res = REAL(pthread_detach)(th);
----------------
ccotter wrote:
This is also a relic from when I started out with the implementation in https://reviews.llvm.org/D65383, but it has since diverged. I'll clean this up. (By the way, I've referred to that Phabricator review quite a bit, and I do want to say it has been a great inspiration and starting point!)
https://github.com/llvm/llvm-project/pull/178836
More information about the llvm-commits
mailing list