[libcxx-commits] [libcxx] [libc++] Replace mutex+condvar with atomics in __call_once (PR #192433)

Thu Apr 16 21:51:37 PDT 2026

https://github.com/shoniecaplan updated https://github.com/llvm/llvm-project/pull/192433

>From 604012db4729c1c6cfa1f9f55efab064c4b8de66 Mon Sep 17 00:00:00 2001
From: Shonie Caplan <shonie4caplan at gmail.com>
Date: Fri, 17 Apr 2026 13:50:53 +0900
Subject: [PATCH] [libc++] Replace mutex+condvar with atomics in __call_once

Replace the mutex and condition variable in __call_once with atomics.

To keep the number of compare-and-swaps down, the flag is loaded first and the
CAS is attempted only when that load observes the flag as unset.

Roughly a 30% improvement on uncontended calls and ~7-15% on contended ones.
---
 libcxx/src/call_once.cpp                   |  60 +++++----
 libcxx/test/benchmarks/call_once.bench.cpp | 141 +++++++++++++++++++++
 2 files changed, 178 insertions(+), 23 deletions(-)
 create mode 100644 libcxx/test/benchmarks/call_once.bench.cpp

diff --git a/libcxx/src/call_once.cpp b/libcxx/src/call_once.cpp
index 237969aacbab9..e12ddc5eb0917 100644
--- a/libcxx/src/call_once.cpp
+++ b/libcxx/src/call_once.cpp
@@ -16,6 +16,8 @@
 
 #include "include/atomic_support.h"
 
+#include <__atomic/contention_t.h>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // If dispatch_once_f ever handles C++ exceptions, and if one can get to it
@@ -25,8 +27,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // keep in sync with:  7741191.
 
 #if _LIBCPP_HAS_THREADS
-static constinit __libcpp_mutex_t mut  = _LIBCPP_MUTEX_INITIALIZER;
-static constinit __libcpp_condvar_t cv = _LIBCPP_CONDVAR_INITIALIZER;
+_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI void __libcpp_atomic_wait(void const volatile*, __cxx_contention_t) _NOEXCEPT;
 #endif
 
 void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(void*)) {
@@ -42,27 +45,38 @@ void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(
 
 #else // !_LIBCPP_HAS_THREADS
 
-  __libcpp_mutex_lock(&mut);
-  while (flag == once_flag::_Pending)
-    __libcpp_condvar_wait(&cv, &mut);
-  if (flag == once_flag::_Unset) {
-    auto guard = std::__make_exception_guard([&flag] {
-      __libcpp_mutex_lock(&mut);
-      __libcpp_relaxed_store(&flag, once_flag::_Unset);
-      __libcpp_mutex_unlock(&mut);
-      __libcpp_condvar_broadcast(&cv);
-    });
-
-    __libcpp_relaxed_store(&flag, once_flag::_Pending);
-    __libcpp_mutex_unlock(&mut);
-    func(arg);
-    __libcpp_mutex_lock(&mut);
-    __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
-    __libcpp_mutex_unlock(&mut);
-    __libcpp_condvar_broadcast(&cv);
-    guard.__complete();
-  } else {
-    __libcpp_mutex_unlock(&mut);
+  auto flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+
+WAIT:
+  while (flag_read == once_flag::_Pending) {
+    __cxx_contention_t monitor = __libcpp_atomic_monitor(&flag);
+    flag_read                  = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+    if (flag_read == once_flag::_Pending) {
+      __libcpp_atomic_wait(&flag, monitor);
+      flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+    }
+  }
+
+  if (flag_read == once_flag::_Unset) {
+    once_flag::_State_type expected = once_flag::_Unset;
+    if (__atomic_compare_exchange_n(&flag, &expected, once_flag::_Pending, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) {
+      auto guard = std::__make_exception_guard([&flag] {
+        __libcpp_atomic_store(&flag, once_flag::_Unset, _AO_Release);
+        __cxx_atomic_notify_all(&flag);
+      });
+
+      func(arg);
+
+      __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
+      __cxx_atomic_notify_all(&flag);
+      guard.__complete();
+
+    } else {
+      if (expected == once_flag::_Pending) {
+        flag_read = expected;
+        goto WAIT;
+      }
+    }
   }
 
 #endif // !_LIBCPP_HAS_THREADS
diff --git a/libcxx/test/benchmarks/call_once.bench.cpp b/libcxx/test/benchmarks/call_once.bench.cpp
new file mode 100644
index 0000000000000..2b0eabf87f912
--- /dev/null
+++ b/libcxx/test/benchmarks/call_once.bench.cpp
@@ -0,0 +1,141 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+#include <mutex>
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <benchmark/benchmark.h>
+
+// Steady state, single thread: f is completed by the call before the timed loop, so the timed
+// calls never run the callable. Intended to measure the inline header fast path only, i.e. calls
+static void BM_call_once_steady(benchmark::State& state) {
+  std::once_flag f;
+  std::call_once(f, [] {}); // untimed: completes the flag; the timed calls are meant to never enter __call_once
+  for (auto _ : state) {
+    std::call_once(f, [] {});
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_call_once_steady);
+
+// Steady state under contention: N threads hammer one shared, already-complete flag.
+// Measures whether the acquire-load scales across cores.
+static void BM_call_once_steady_contended(benchmark::State& state) {
+  // static: every benchmark thread runs this function itself, so a plain local would be per-thread.
+  static std::once_flag f;
+  std::call_once(f, [] {});
+  for (auto _ : state)
+    std::call_once(f, [] {});
+}
+BENCHMARK(BM_call_once_steady_contended)->Threads(2)->Threads(4)->Threads(8)->Threads(16);
+
+// Cold path: a fresh once_flag is constructed every iteration, single thread, so every timed
+// call_once runs its callable. Measures one full trip through __call_once (CAS or mutex path).
+static void BM_call_once_cold(benchmark::State& state) {
+  for (auto _ : state) {
+    std::once_flag f;
+    std::call_once(f, [] {});
+    benchmark::DoNotOptimize(f);
+  }
+}
+BENCHMARK(BM_call_once_cold);
+
+// Contended: N threads (N = state.range(0)) race on a fresh flag every iteration.
+// One wins and runs func, the rest wait then return.
+// Timed region: from releasing the threads until all are joined; setup and thread creation are paused.
+static void BM_call_once_contended(benchmark::State& state) {
+  const int nthreads = state.range(0);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    std::once_flag flag;
+    std::atomic<bool> go{false};
+    std::atomic<int> ready{0};
+    std::vector<std::thread> threads;
+    threads.reserve(nthreads);
+
+    for (int i = 0; i < nthreads; ++i) {
+      threads.emplace_back([&] {
+        ready.fetch_add(1, std::memory_order_relaxed);
+        while (!go.load(std::memory_order_acquire)) {
+        }
+        std::call_once(flag, [] {});
+      });
+    }
+
+    // Spin until every worker has checked in, so thread start-up stays outside the timed region.
+    while (ready.load(std::memory_order_relaxed) < nthreads) {
+    }
+
+    state.ResumeTiming();
+    go.store(true, std::memory_order_release);
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  state.SetItemsProcessed(state.iterations() * nthreads); // one call_once per thread per iteration
+}
+BENCHMARK(BM_call_once_contended)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Contended with slow init: func performs init_work stores, so the waiters have time to block.
+// Shows cost of wait/wake mechanism under realistic conditions.
+static void BM_call_once_slow_init(benchmark::State& state) {
+  const int nthreads      = state.range(0);
+  constexpr int init_work = 10000; // stores done by the initializer while the other threads wait
+  for (auto _ : state) {
+    state.PauseTiming();
+    std::once_flag flag;
+    std::atomic<bool> go{false};
+    std::atomic<int> ready{0};
+    int shared_data = 0;
+    std::vector<std::thread> threads;
+    threads.reserve(nthreads);
+
+    for (int i = 0; i < nthreads; ++i) {
+      threads.emplace_back([&] {
+        ready.fetch_add(1, std::memory_order_relaxed);
+        while (!go.load(std::memory_order_acquire)) {
+        }
+        std::call_once(flag, [&] {
+          for (int step = 0; step < init_work; ++step)
+            benchmark::DoNotOptimize(shared_data = step);
+        });
+        benchmark::DoNotOptimize(shared_data);
+      });
+    }
+    while (ready.load(std::memory_order_relaxed) < nthreads) {
+    }
+
+    state.ResumeTiming();
+    go.store(true, std::memory_order_release);
+    for (auto& t : threads)
+      t.join();
+  }
+}
+BENCHMARK(BM_call_once_slow_init)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Throughput: 1000 fresh flags per benchmark iteration, each used for one call_once, single thread.
+// Measures raw cold-path throughput without thread overhead.
+static void BM_call_once_throughput(benchmark::State& state) {
+  for (auto _ : state) {
+    for (int i = 0; i < 1000; ++i) {
+      std::once_flag f;
+      std::call_once(f, [] {});
+      benchmark::DoNotOptimize(f);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * 1000); // 1000 call_once calls per iteration
+}
+BENCHMARK(BM_call_once_throughput);
+
+BENCHMARK_MAIN();