[libcxx-commits] [libcxx] [libc++] Replace mutex+condvar with atomics in __call_once (PR #192433)
Shonie Caplan via libcxx-commits
libcxx-commits at lists.llvm.org
Thu Apr 16 21:51:37 PDT 2026
https://github.com/shoniecaplan updated https://github.com/llvm/llvm-project/pull/192433
>From 604012db4729c1c6cfa1f9f55efab064c4b8de66 Mon Sep 17 00:00:00 2001
From: Shonie Caplan <shonie4caplan at gmail.com>
Date: Fri, 17 Apr 2026 13:50:53 +0900
Subject: [PATCH] [libc++] Replace mutex+condvar with atomics in __call_once
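Replace the global mutex and condition variable in __call_once with atomic operations (load, compare-and-swap, wait/notify) on the flag itself.
An acquire load precedes the compare-and-swap, so the CAS is only attempted when the flag was observed as unset.
This gives roughly a 30% improvement on uncontended calls and ~7-15% on contended ones.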
---
libcxx/src/call_once.cpp | 60 +++++----
libcxx/test/benchmarks/call_once.bench.cpp | 141 +++++++++++++++++++++
2 files changed, 178 insertions(+), 23 deletions(-)
create mode 100644 libcxx/test/benchmarks/call_once.bench.cpp
diff --git a/libcxx/src/call_once.cpp b/libcxx/src/call_once.cpp
index 237969aacbab9..e12ddc5eb0917 100644
--- a/libcxx/src/call_once.cpp
+++ b/libcxx/src/call_once.cpp
@@ -16,6 +16,8 @@
#include "include/atomic_support.h"
+#include <__atomic/contention_t.h>
+
_LIBCPP_BEGIN_NAMESPACE_STD
// If dispatch_once_f ever handles C++ exceptions, and if one can get to it
@@ -25,8 +27,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// keep in sync with: 7741191.
#if _LIBCPP_HAS_THREADS
-static constinit __libcpp_mutex_t mut = _LIBCPP_MUTEX_INITIALIZER;
-static constinit __libcpp_condvar_t cv = _LIBCPP_CONDVAR_INITIALIZER;
+_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI void __libcpp_atomic_wait(void const volatile*, __cxx_contention_t) _NOEXCEPT;
#endif
void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(void*)) {
@@ -42,27 +45,38 @@ void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(
#else // !_LIBCPP_HAS_THREADS
- __libcpp_mutex_lock(&mut);
- while (flag == once_flag::_Pending)
- __libcpp_condvar_wait(&cv, &mut);
- if (flag == once_flag::_Unset) {
- auto guard = std::__make_exception_guard([&flag] {
- __libcpp_mutex_lock(&mut);
- __libcpp_relaxed_store(&flag, once_flag::_Unset);
- __libcpp_mutex_unlock(&mut);
- __libcpp_condvar_broadcast(&cv);
- });
-
- __libcpp_relaxed_store(&flag, once_flag::_Pending);
- __libcpp_mutex_unlock(&mut);
- func(arg);
- __libcpp_mutex_lock(&mut);
- __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
- __libcpp_mutex_unlock(&mut);
- __libcpp_condvar_broadcast(&cv);
- guard.__complete();
- } else {
- __libcpp_mutex_unlock(&mut);
+ auto flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+
+WAIT:
+ while (flag_read == once_flag::_Pending) {
+ __cxx_contention_t monitor = __libcpp_atomic_monitor(&flag);
+ flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+ if (flag_read == once_flag::_Pending) {
+ __libcpp_atomic_wait(&flag, monitor);
+ flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+ }
+ }
+
+ if (flag_read == once_flag::_Unset) {
+ once_flag::_State_type expected = once_flag::_Unset;
+ if (__atomic_compare_exchange_n(&flag, &expected, once_flag::_Pending, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) {
+ auto guard = std::__make_exception_guard([&flag] {
+ __libcpp_atomic_store(&flag, once_flag::_Unset, _AO_Release);
+ __cxx_atomic_notify_all(&flag);
+ });
+
+ func(arg);
+
+ __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
+ __cxx_atomic_notify_all(&flag);
+ guard.__complete();
+
+ } else {
+ if (expected == once_flag::_Pending) {
+ flag_read = expected;
+ goto WAIT;
+ }
+ }
}
#endif // !_LIBCPP_HAS_THREADS
diff --git a/libcxx/test/benchmarks/call_once.bench.cpp b/libcxx/test/benchmarks/call_once.bench.cpp
new file mode 100644
index 0000000000000..2b0eabf87f912
--- /dev/null
+++ b/libcxx/test/benchmarks/call_once.bench.cpp
@@ -0,0 +1,141 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+#include <mutex>
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <benchmark/benchmark.h>
+
+// Warm path, single thread: the flag is completed once before the timed loop,
+// so every measured call sees an already-complete once_flag.
+static void BM_call_once_steady(benchmark::State& state) {
+  std::once_flag done_flag;
+  std::call_once(done_flag, [] {}); // untimed: drives the flag to completion
+  for (auto _ : state) {
+    std::call_once(done_flag, [] {});
+    benchmark::ClobberMemory(); // keep the repeated calls from being folded together
+  }
+}
+BENCHMARK(BM_call_once_steady);
+
+// Steady state with sharing: all benchmark threads call call_once on the same,
+// already-complete flag, so the completed-flag check is exercised across cores.
+static void BM_call_once_steady_contended(benchmark::State& state) {
+  // Function-local static: one flag for all benchmark threads (a plain local would be per-thread).
+  static std::once_flag f;
+  std::call_once(f, [] {}); // untimed; whichever thread gets here first completes the flag
+  for (auto _ : state)
+    std::call_once(f, [] {});
+}
+BENCHMARK(BM_call_once_steady_contended)->Threads(2)->Threads(4)->Threads(8)->Threads(16);
+
+// First-call cost, single thread: every timed iteration builds a new once_flag
+// and makes the one call that has to run the callable (presumably via __call_once).
+static void BM_call_once_cold(benchmark::State& state) {
+  for (auto _ : state) {
+    std::once_flag fresh_flag;
+    std::call_once(fresh_flag, [] {});
+    benchmark::DoNotOptimize(fresh_flag); // keep the flag and the call observable
+  }
+}
+BENCHMARK(BM_call_once_cold);
+
+// Racing first call: range(0) workers are released together on a brand-new flag.
+// Thread start-up and the rendezvous are excluded from timing; the timed span
+// covers the release, every worker's call_once, and the joins.
+static void BM_call_once_contended(benchmark::State& state) {
+  const int num_threads = static_cast<int>(state.range(0));
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    std::once_flag once;
+    std::atomic<bool> start{false};
+    std::atomic<int> arrived{0};
+    std::vector<std::thread> workers;
+    workers.reserve(num_threads);
+
+    // Each worker checks in, spins until released, then makes its single call.
+    const auto worker = [&] {
+      arrived.fetch_add(1, std::memory_order_relaxed);
+      while (!start.load(std::memory_order_acquire)) {
+      }
+      std::call_once(once, [] {});
+    };
+    for (int i = 0; i < num_threads; ++i)
+      workers.emplace_back(worker);
+
+    // Keep the clock paused until every worker has checked in.
+    while (arrived.load(std::memory_order_relaxed) < num_threads) {
+    }
+
+    state.ResumeTiming();
+    start.store(true, std::memory_order_release);
+
+    for (std::thread& w : workers)
+      w.join();
+  }
+
+  state.SetItemsProcessed(state.iterations() * num_threads);
+}
+BENCHMARK(BM_call_once_contended)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Contended with a deliberately slow initializer: the winning thread does busy
+// work inside func, so threads that lose the race are likely to find it still running.
+static void BM_call_once_slow_init(benchmark::State& state) {
+  const int nthreads = static_cast<int>(state.range(0));
+  for (auto _ : state) {
+    state.PauseTiming();
+    std::once_flag flag;
+    std::atomic<bool> go{false};
+    std::atomic<int> ready{0};
+    int shared_data = 0;
+    std::vector<std::thread> threads;
+    threads.reserve(nthreads);
+
+    for (int i = 0; i < nthreads; ++i) {
+      threads.emplace_back([&] {
+        ready.fetch_add(1, std::memory_order_relaxed);
+        while (!go.load(std::memory_order_acquire)) {
+        }
+        std::call_once(flag, [&] {
+          for (int step = 0; step <= 42000; ++step) // busy work; leaves shared_data == 42
+            benchmark::DoNotOptimize(shared_data = step / 1000);
+        });
+        benchmark::DoNotOptimize(shared_data);
+      });
+    }
+
+    while (ready.load(std::memory_order_relaxed) < nthreads) { // wait for all threads to check in
+    }
+    state.ResumeTiming();
+    go.store(true, std::memory_order_release);
+    for (auto& t : threads)
+      t.join();
+  }
+  state.SetItemsProcessed(state.iterations() * nthreads); // same reporting as BM_call_once_contended
+}
+BENCHMARK(BM_call_once_slow_init)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Single-threaded throughput: every timed iteration makes kBatch first-time calls, each on a fresh flag.
+static void BM_call_once_throughput(benchmark::State& state) {
+  constexpr int kBatch = 1000;
+  for (auto _ : state) {
+    for (int remaining = kBatch; remaining > 0; --remaining) {
+      std::once_flag fresh_flag;
+      std::call_once(fresh_flag, [] {});
+      benchmark::DoNotOptimize(fresh_flag);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * kBatch);
+}
+BENCHMARK(BM_call_once_throughput);
+
+BENCHMARK_MAIN();
More information about the libcxx-commits
mailing list