[libcxx-commits] [libcxx] [libc++] Replace mutex+condvar with atomics in __call_once (PR #192433)
Shonie Caplan via libcxx-commits
libcxx-commits at lists.llvm.org
Thu Apr 23 04:40:21 PDT 2026
https://github.com/shoniecaplan updated https://github.com/llvm/llvm-project/pull/192433
>From 604012db4729c1c6cfa1f9f55efab064c4b8de66 Mon Sep 17 00:00:00 2001
From: Shonie Caplan <shonie4caplan at gmail.com>
Date: Fri, 17 Apr 2026 13:50:53 +0900
Subject: [PATCH 1/2] [libc++] Replace mutex+condvar with atomics in
__call_once
Replace the mutex and condition variable in __call_once with atomic operations.
A plain load is done before each compare-and-swap to keep the number of compare-and-swaps down.
This gives roughly a 30% improvement on uncontended calls and ~7-15% on contended ones.
---
libcxx/src/call_once.cpp | 60 +++++----
libcxx/test/benchmarks/call_once.bench.cpp | 141 +++++++++++++++++++++
2 files changed, 178 insertions(+), 23 deletions(-)
create mode 100644 libcxx/test/benchmarks/call_once.bench.cpp
diff --git a/libcxx/src/call_once.cpp b/libcxx/src/call_once.cpp
index 237969aacbab9..e12ddc5eb0917 100644
--- a/libcxx/src/call_once.cpp
+++ b/libcxx/src/call_once.cpp
@@ -16,6 +16,8 @@
#include "include/atomic_support.h"
+#include <__atomic/contention_t.h>
+
_LIBCPP_BEGIN_NAMESPACE_STD
// If dispatch_once_f ever handles C++ exceptions, and if one can get to it
@@ -25,8 +27,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// keep in sync with: 7741191.
#if _LIBCPP_HAS_THREADS
-static constinit __libcpp_mutex_t mut = _LIBCPP_MUTEX_INITIALIZER;
-static constinit __libcpp_condvar_t cv = _LIBCPP_CONDVAR_INITIALIZER;
+_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI void __libcpp_atomic_wait(void const volatile*, __cxx_contention_t) _NOEXCEPT;
#endif
void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(void*)) {
@@ -42,27 +45,38 @@ void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(
#else // !_LIBCPP_HAS_THREADS
- __libcpp_mutex_lock(&mut);
- while (flag == once_flag::_Pending)
- __libcpp_condvar_wait(&cv, &mut);
- if (flag == once_flag::_Unset) {
- auto guard = std::__make_exception_guard([&flag] {
- __libcpp_mutex_lock(&mut);
- __libcpp_relaxed_store(&flag, once_flag::_Unset);
- __libcpp_mutex_unlock(&mut);
- __libcpp_condvar_broadcast(&cv);
- });
-
- __libcpp_relaxed_store(&flag, once_flag::_Pending);
- __libcpp_mutex_unlock(&mut);
- func(arg);
- __libcpp_mutex_lock(&mut);
- __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
- __libcpp_mutex_unlock(&mut);
- __libcpp_condvar_broadcast(&cv);
- guard.__complete();
- } else {
- __libcpp_mutex_unlock(&mut);
+ auto flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+
+WAIT:
+ while (flag_read == once_flag::_Pending) {
+ __cxx_contention_t monitor = __libcpp_atomic_monitor(&flag);
+ flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+ if (flag_read == once_flag::_Pending) {
+ __libcpp_atomic_wait(&flag, monitor);
+ flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+ }
+ }
+
+ if (flag_read == once_flag::_Unset) {
+ once_flag::_State_type expected = once_flag::_Unset;
+ if (__atomic_compare_exchange_n(&flag, &expected, once_flag::_Pending, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) {
+ auto guard = std::__make_exception_guard([&flag] {
+ __libcpp_atomic_store(&flag, once_flag::_Unset, _AO_Release);
+ __cxx_atomic_notify_all(&flag);
+ });
+
+ func(arg);
+
+ __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
+ __cxx_atomic_notify_all(&flag);
+ guard.__complete();
+
+ } else {
+ if (expected == once_flag::_Pending) {
+ flag_read = expected;
+ goto WAIT;
+ }
+ }
}
#endif // !_LIBCPP_HAS_THREADS
diff --git a/libcxx/test/benchmarks/call_once.bench.cpp b/libcxx/test/benchmarks/call_once.bench.cpp
new file mode 100644
index 0000000000000..2b0eabf87f912
--- /dev/null
+++ b/libcxx/test/benchmarks/call_once.bench.cpp
@@ -0,0 +1,141 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+#include <mutex>
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <benchmark/benchmark.h>
+
+// Steady state: flag already _Complete, never enters __call_once.
+// Measures the inline header fast-path only.
+static void BM_call_once_steady(benchmark::State& state) {
+ std::once_flag f;
+ std::call_once(f, [] {});
+ for (auto _ : state) {
+ std::call_once(f, [] {});
+ benchmark::ClobberMemory();
+ }
+}
+BENCHMARK(BM_call_once_steady);
+
+// Steady state under contention: N threads hammer an already-complete flag.
+// Measures whether the acquire-load scales across cores.
+static void BM_call_once_steady_contended(benchmark::State& state) {
+ std::once_flag f;
+ std::call_once(f, [] {});
+
+ for (auto _ : state)
+ std::call_once(f, [] {});
+}
+BENCHMARK(BM_call_once_steady_contended)->Threads(2)->Threads(4)->Threads(8)->Threads(16);
+
+// Cold path: fresh flag each iteration, single thread.
+// Measures one full trip through __call_once (CAS or mutex path).
+static void BM_call_once_cold(benchmark::State& state) {
+ for (auto _ : state) {
+ std::once_flag f;
+ std::call_once(f, [] {});
+ benchmark::DoNotOptimize(f);
+ }
+}
+BENCHMARK(BM_call_once_cold);
+
+// Contended: N threads race on a fresh flag.
+// One wins and runs func, the rest wait then return.
+// Measures the full contended path including wait/wake.
+static void BM_call_once_contended(benchmark::State& state) {
+ const int nthreads = state.range(0);
+
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ std::once_flag flag;
+ std::atomic<bool> go{false};
+ std::atomic<int> ready{0};
+ std::vector<std::thread> threads;
+ threads.reserve(nthreads);
+
+ for (int i = 0; i < nthreads; ++i) {
+ threads.emplace_back([&] {
+ ready.fetch_add(1, std::memory_order_relaxed);
+ while (!go.load(std::memory_order_acquire)) {
+ }
+ std::call_once(flag, [] {});
+ });
+ }
+
+ // Wait for all threads to be ready
+ while (ready.load(std::memory_order_relaxed) < nthreads) {
+ }
+
+ state.ResumeTiming();
+ go.store(true, std::memory_order_release);
+
+ for (auto& t : threads)
+ t.join();
+ }
+
+ state.SetItemsProcessed(state.iterations() * nthreads);
+}
+BENCHMARK(BM_call_once_contended)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Contended with a data-publishing init: func stores to shared_data, and every
+// thread reads shared_data after call_once returns (func itself is not slow).
+static void BM_call_once_slow_init(benchmark::State& state) {
+ const int nthreads = state.range(0);
+
+ for (auto _ : state) {
+ state.PauseTiming();
+
+ std::once_flag flag;
+ std::atomic<bool> go{false};
+ std::atomic<int> ready{0};
+ int shared_data = 0;
+ std::vector<std::thread> threads;
+ threads.reserve(nthreads);
+
+ for (int i = 0; i < nthreads; ++i) {
+ threads.emplace_back([&] {
+ ready.fetch_add(1, std::memory_order_relaxed);
+ while (!go.load(std::memory_order_acquire)) {
+ }
+ std::call_once(flag, [&] { benchmark::DoNotOptimize(shared_data = 42); });
+ benchmark::DoNotOptimize(shared_data);
+ });
+ }
+
+ while (ready.load(std::memory_order_relaxed) < nthreads) {
+ }
+
+ state.ResumeTiming();
+ go.store(true, std::memory_order_release);
+
+ for (auto& t : threads)
+ t.join();
+ }
+}
+BENCHMARK(BM_call_once_slow_init)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Throughput: many fresh flags in sequence, single thread.
+// Measures raw cold-path throughput without thread overhead.
+static void BM_call_once_throughput(benchmark::State& state) {
+ for (auto _ : state) {
+ for (int i = 0; i < 1000; ++i) {
+ std::once_flag f;
+ std::call_once(f, [] {});
+ benchmark::DoNotOptimize(f);
+ }
+ }
+ state.SetItemsProcessed(state.iterations() * 1000);
+}
+BENCHMARK(BM_call_once_throughput);
+
+BENCHMARK_MAIN();
>From 1004a558d41258897a4728cc838537cf1d9915d1 Mon Sep 17 00:00:00 2001
From: Shonie Caplan <shonie4caplan at gmail.com>
Date: Thu, 23 Apr 2026 20:39:04 +0900
Subject: [PATCH 2/2] squash: update to use atomic functions
---
libcxx/include/__mutex/once_flag.h | 12 +++++-----
libcxx/src/call_once.cpp | 36 ++++++++----------------------
2 files changed, 16 insertions(+), 32 deletions(-)
diff --git a/libcxx/include/__mutex/once_flag.h b/libcxx/include/__mutex/once_flag.h
index ad15b2eb6df68..5ca8d47f0bf2d 100644
--- a/libcxx/include/__mutex/once_flag.h
+++ b/libcxx/include/__mutex/once_flag.h
@@ -9,6 +9,8 @@
#ifndef _LIBCPP___MUTEX_ONCE_FLAG_H
#define _LIBCPP___MUTEX_ONCE_FLAG_H
+#include <__atomic/atomic.h>
+#include <__atomic/memory_order.h>
#include <__config>
#include <__memory/addressof.h>
#include <__tuple/tuple_size.h>
@@ -63,7 +65,7 @@ struct once_flag {
static const _State_type _Complete = ~_State_type(0);
private:
- _State_type __state_;
+ atomic<_State_type> __state_;
#ifndef _LIBCPP_CXX03_LANG
template <class _Callable, class... _Args>
@@ -113,7 +115,7 @@ void _LIBCPP_HIDE_FROM_ABI __call_once_proxy(void* __vp) {
(*__p)();
}
-_LIBCPP_EXPORTED_FROM_ABI void __call_once(volatile once_flag::_State_type&, void*, void (*)(void*));
+_LIBCPP_EXPORTED_FROM_ABI void __call_once(atomic<once_flag::_State_type>&, void*, void (*)(void*));
template <class _ValueType>
inline _LIBCPP_HIDE_FROM_ABI _ValueType __libcpp_acquire_load(_ValueType const* __value) {
@@ -128,7 +130,7 @@ inline _LIBCPP_HIDE_FROM_ABI _ValueType __libcpp_acquire_load(_ValueType const*
template <class _Callable, class... _Args>
inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable&& __func, _Args&&... __args) {
- if (__libcpp_acquire_load(&__flag.__state_) != once_flag::_Complete) {
+ if (__flag.__state_.load(memory_order_acquire) != once_flag::_Complete) {
typedef tuple<_Callable&&, _Args&&...> _Gp;
_Gp __f(std::forward<_Callable>(__func), std::forward<_Args>(__args)...);
__call_once_param<_Gp> __p(__f);
@@ -140,7 +142,7 @@ inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable&& __fun
template <class _Callable>
inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable& __func) {
- if (__libcpp_acquire_load(&__flag.__state_) != once_flag::_Complete) {
+ if (__flag.__state_.load(memory_order_acquire) != once_flag::_Complete) {
__call_once_param<_Callable> __p(__func);
std::__call_once(__flag.__state_, std::addressof(__p), std::addressof(__call_once_proxy<_Callable>));
}
@@ -148,7 +150,7 @@ inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable& __func
template <class _Callable>
inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, const _Callable& __func) {
- if (__libcpp_acquire_load(&__flag.__state_) != once_flag::_Complete) {
+ if (__flag.__state_.load(memory_order_acquire) != once_flag::_Complete) {
__call_once_param<const _Callable> __p(__func);
std::__call_once(__flag.__state_, std::addressof(__p), std::addressof(__call_once_proxy<const _Callable>));
}
diff --git a/libcxx/src/call_once.cpp b/libcxx/src/call_once.cpp
index e12ddc5eb0917..db18f2182c290 100644
--- a/libcxx/src/call_once.cpp
+++ b/libcxx/src/call_once.cpp
@@ -10,14 +10,6 @@
#include <__mutex/once_flag.h>
#include <__utility/exception_guard.h>
-#if _LIBCPP_HAS_THREADS
-# include <__thread/support.h>
-#endif
-
-#include "include/atomic_support.h"
-
-#include <__atomic/contention_t.h>
-
_LIBCPP_BEGIN_NAMESPACE_STD
// If dispatch_once_f ever handles C++ exceptions, and if one can get to it
@@ -26,13 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// call into dispatch_once_f instead of here. Relevant radar this code needs to
// keep in sync with: 7741191.
-#if _LIBCPP_HAS_THREADS
-_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile*) _NOEXCEPT;
-_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor(void const volatile*) _NOEXCEPT;
-_LIBCPP_EXPORTED_FROM_ABI void __libcpp_atomic_wait(void const volatile*, __cxx_contention_t) _NOEXCEPT;
-#endif
-
-void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(void*)) {
+void __call_once(atomic<once_flag::_State_type>& flag, void* arg, void (*func)(void*)) {
#if !_LIBCPP_HAS_THREADS
if (flag == once_flag::_Unset) {
@@ -45,30 +31,26 @@ void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(
#else // !_LIBCPP_HAS_THREADS
- auto flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+ auto flag_read = flag.load(memory_order_acquire);
WAIT:
while (flag_read == once_flag::_Pending) {
- __cxx_contention_t monitor = __libcpp_atomic_monitor(&flag);
- flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
- if (flag_read == once_flag::_Pending) {
- __libcpp_atomic_wait(&flag, monitor);
- flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
- }
+ flag.wait(once_flag::_Pending, memory_order_acquire);
+ flag_read = flag.load(memory_order_acquire);
}
if (flag_read == once_flag::_Unset) {
once_flag::_State_type expected = once_flag::_Unset;
- if (__atomic_compare_exchange_n(&flag, &expected, once_flag::_Pending, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) {
+ if (flag.compare_exchange_strong(expected, once_flag::_Pending, memory_order_acquire, memory_order_acquire)) {
auto guard = std::__make_exception_guard([&flag] {
- __libcpp_atomic_store(&flag, once_flag::_Unset, _AO_Release);
- __cxx_atomic_notify_all(&flag);
+ flag.store(once_flag::_Unset, memory_order_release);
+ flag.notify_all();
});
func(arg);
- __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
- __cxx_atomic_notify_all(&flag);
+ flag.store(once_flag::_Complete, memory_order_release);
+ flag.notify_all();
guard.__complete();
} else {
More information about the libcxx-commits
mailing list