[libcxx-commits] [libcxx] [libc++] Replace mutex+condvar with atomics in __call_once (PR #192433)

Shonie Caplan via libcxx-commits libcxx-commits at lists.llvm.org
Thu Apr 23 04:40:21 PDT 2026


https://github.com/shoniecaplan updated https://github.com/llvm/llvm-project/pull/192433

>From 604012db4729c1c6cfa1f9f55efab064c4b8de66 Mon Sep 17 00:00:00 2001
From: Shonie Caplan <shonie4caplan at gmail.com>
Date: Fri, 17 Apr 2026 13:50:53 +0900
Subject: [PATCH 1/2] [libc++] Replace mutex+condvar with atomics in
 __call_once

Replace the mutex and condition variable in __call_once with atomic operations.

To keep the number of compare-and-swaps down, the flag is loaded first and the CAS is only attempted when that load observes _Unset.

Roughly a 30% improvement on uncontended calls and ~7-15% on contended ones.
---
 libcxx/src/call_once.cpp                   |  60 +++++----
 libcxx/test/benchmarks/call_once.bench.cpp | 141 +++++++++++++++++++++
 2 files changed, 178 insertions(+), 23 deletions(-)
 create mode 100644 libcxx/test/benchmarks/call_once.bench.cpp

diff --git a/libcxx/src/call_once.cpp b/libcxx/src/call_once.cpp
index 237969aacbab9..e12ddc5eb0917 100644
--- a/libcxx/src/call_once.cpp
+++ b/libcxx/src/call_once.cpp
@@ -16,6 +16,8 @@
 
 #include "include/atomic_support.h"
 
+#include <__atomic/contention_t.h>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // If dispatch_once_f ever handles C++ exceptions, and if one can get to it
@@ -25,8 +27,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // keep in sync with:  7741191.
 
 #if _LIBCPP_HAS_THREADS
-static constinit __libcpp_mutex_t mut  = _LIBCPP_MUTEX_INITIALIZER;
-static constinit __libcpp_condvar_t cv = _LIBCPP_CONDVAR_INITIALIZER;
+_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor(void const volatile*) _NOEXCEPT;
+_LIBCPP_EXPORTED_FROM_ABI void __libcpp_atomic_wait(void const volatile*, __cxx_contention_t) _NOEXCEPT;
 #endif
 
 void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(void*)) {
@@ -42,27 +45,38 @@ void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(
 
 #else // !_LIBCPP_HAS_THREADS
 
-  __libcpp_mutex_lock(&mut);
-  while (flag == once_flag::_Pending)
-    __libcpp_condvar_wait(&cv, &mut);
-  if (flag == once_flag::_Unset) {
-    auto guard = std::__make_exception_guard([&flag] {
-      __libcpp_mutex_lock(&mut);
-      __libcpp_relaxed_store(&flag, once_flag::_Unset);
-      __libcpp_mutex_unlock(&mut);
-      __libcpp_condvar_broadcast(&cv);
-    });
-
-    __libcpp_relaxed_store(&flag, once_flag::_Pending);
-    __libcpp_mutex_unlock(&mut);
-    func(arg);
-    __libcpp_mutex_lock(&mut);
-    __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
-    __libcpp_mutex_unlock(&mut);
-    __libcpp_condvar_broadcast(&cv);
-    guard.__complete();
-  } else {
-    __libcpp_mutex_unlock(&mut);
+  auto flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+
+WAIT:
+  while (flag_read == once_flag::_Pending) {
+    __cxx_contention_t monitor = __libcpp_atomic_monitor(&flag);
+    flag_read                  = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+    if (flag_read == once_flag::_Pending) {
+      __libcpp_atomic_wait(&flag, monitor);
+      flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+    }
+  }
+
+  if (flag_read == once_flag::_Unset) {
+    once_flag::_State_type expected = once_flag::_Unset;
+    if (__atomic_compare_exchange_n(&flag, &expected, once_flag::_Pending, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) {
+      auto guard = std::__make_exception_guard([&flag] {
+        __libcpp_atomic_store(&flag, once_flag::_Unset, _AO_Release);
+        __cxx_atomic_notify_all(&flag);
+      });
+
+      func(arg);
+
+      __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
+      __cxx_atomic_notify_all(&flag);
+      guard.__complete();
+
+    } else {
+      if (expected == once_flag::_Pending) {
+        flag_read = expected;
+        goto WAIT;
+      }
+    }
   }
 
 #endif // !_LIBCPP_HAS_THREADS
diff --git a/libcxx/test/benchmarks/call_once.bench.cpp b/libcxx/test/benchmarks/call_once.bench.cpp
new file mode 100644
index 0000000000000..2b0eabf87f912
--- /dev/null
+++ b/libcxx/test/benchmarks/call_once.bench.cpp
@@ -0,0 +1,141 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+#include <mutex>
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <benchmark/benchmark.h>
+
+// Steady state: flag already _Complete, never enters __call_once.
+// Measures the inline header fast-path only.
+static void BM_call_once_steady(benchmark::State& state) {
+  std::once_flag f;
+  std::call_once(f, [] {});
+  for (auto _ : state) {
+    std::call_once(f, [] {});
+    benchmark::ClobberMemory();
+  }
+}
+BENCHMARK(BM_call_once_steady);
+
+// Steady state under contention: N threads hammer an already-complete flag.
+// Measures whether the acquire-load scales across cores.
+static void BM_call_once_steady_contended(benchmark::State& state) {
+  static std::once_flag f; // static: the body runs once per thread, so a local flag would not be shared
+  std::call_once(f, [] {});
+
+  for (auto _ : state)
+    std::call_once(f, [] {});
+}
+BENCHMARK(BM_call_once_steady_contended)->Threads(2)->Threads(4)->Threads(8)->Threads(16);
+
+// Cold path: fresh flag each iteration, single thread.
+// Measures one full trip through __call_once (CAS or mutex path).
+static void BM_call_once_cold(benchmark::State& state) {
+  for (auto _ : state) {
+    std::once_flag f;
+    std::call_once(f, [] {});
+    benchmark::DoNotOptimize(f);
+  }
+}
+BENCHMARK(BM_call_once_cold);
+
+// Contended: N threads race on a fresh flag.
+// One wins and runs func, the rest wait then return.
+// Measures the full contended path including wait/wake.
+static void BM_call_once_contended(benchmark::State& state) {
+  const int nthreads = state.range(0);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    std::once_flag flag;
+    std::atomic<bool> go{false};
+    std::atomic<int> ready{0};
+    std::vector<std::thread> threads;
+    threads.reserve(nthreads);
+
+    for (int i = 0; i < nthreads; ++i) {
+      threads.emplace_back([&] {
+        ready.fetch_add(1, std::memory_order_relaxed);
+        while (!go.load(std::memory_order_acquire)) {
+        }
+        std::call_once(flag, [] {});
+      });
+    }
+
+    // Wait for all threads to be ready
+    while (ready.load(std::memory_order_relaxed) < nthreads) {
+    }
+
+    state.ResumeTiming();
+    go.store(true, std::memory_order_release);
+
+    for (auto& t : threads)
+      t.join();
+  }
+
+  state.SetItemsProcessed(state.iterations() * nthreads);
+}
+BENCHMARK(BM_call_once_contended)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Contended with an init that publishes data: func stores to shared_data and
+// every thread reads it afterwards. NOTE: func does no slow work yet, so waiters may not block.
+static void BM_call_once_slow_init(benchmark::State& state) {
+  const int nthreads = state.range(0);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    std::once_flag flag;
+    std::atomic<bool> go{false};
+    std::atomic<int> ready{0};
+    int shared_data = 0;
+    std::vector<std::thread> threads;
+    threads.reserve(nthreads);
+
+    for (int i = 0; i < nthreads; ++i) {
+      threads.emplace_back([&] {
+        ready.fetch_add(1, std::memory_order_relaxed);
+        while (!go.load(std::memory_order_acquire)) {
+        }
+        std::call_once(flag, [&] { benchmark::DoNotOptimize(shared_data = 42); });
+        benchmark::DoNotOptimize(shared_data);
+      });
+    }
+
+    while (ready.load(std::memory_order_relaxed) < nthreads) {
+    }
+
+    state.ResumeTiming();
+    go.store(true, std::memory_order_release);
+
+    for (auto& t : threads)
+      t.join();
+  }
+}
+BENCHMARK(BM_call_once_slow_init)->Arg(2)->Arg(4)->Arg(8)->Arg(16);
+
+// Throughput: many fresh flags in sequence, single thread.
+// Measures raw cold-path throughput without thread overhead.
+static void BM_call_once_throughput(benchmark::State& state) {
+  for (auto _ : state) {
+    for (int i = 0; i < 1000; ++i) {
+      std::once_flag f;
+      std::call_once(f, [] {});
+      benchmark::DoNotOptimize(f);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * 1000);
+}
+BENCHMARK(BM_call_once_throughput);
+
+BENCHMARK_MAIN();

>From 1004a558d41258897a4728cc838537cf1d9915d1 Mon Sep 17 00:00:00 2001
From: Shonie Caplan <shonie4caplan at gmail.com>
Date: Thu, 23 Apr 2026 20:39:04 +0900
Subject: [PATCH 2/2] squash: update to use atomic functions

---
 libcxx/include/__mutex/once_flag.h | 12 +++++-----
 libcxx/src/call_once.cpp           | 36 ++++++++----------------------
 2 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/libcxx/include/__mutex/once_flag.h b/libcxx/include/__mutex/once_flag.h
index ad15b2eb6df68..5ca8d47f0bf2d 100644
--- a/libcxx/include/__mutex/once_flag.h
+++ b/libcxx/include/__mutex/once_flag.h
@@ -9,6 +9,8 @@
 #ifndef _LIBCPP___MUTEX_ONCE_FLAG_H
 #define _LIBCPP___MUTEX_ONCE_FLAG_H
 
+#include <__atomic/atomic.h>
+#include <__atomic/memory_order.h>
 #include <__config>
 #include <__memory/addressof.h>
 #include <__tuple/tuple_size.h>
@@ -63,7 +65,7 @@ struct once_flag {
   static const _State_type _Complete = ~_State_type(0);
 
 private:
-  _State_type __state_;
+  atomic<_State_type> __state_;
 
 #ifndef _LIBCPP_CXX03_LANG
   template <class _Callable, class... _Args>
@@ -113,7 +115,7 @@ void _LIBCPP_HIDE_FROM_ABI __call_once_proxy(void* __vp) {
   (*__p)();
 }
 
-_LIBCPP_EXPORTED_FROM_ABI void __call_once(volatile once_flag::_State_type&, void*, void (*)(void*));
+_LIBCPP_EXPORTED_FROM_ABI void __call_once(atomic<once_flag::_State_type>&, void*, void (*)(void*));
 
 template <class _ValueType>
 inline _LIBCPP_HIDE_FROM_ABI _ValueType __libcpp_acquire_load(_ValueType const* __value) {
@@ -128,7 +130,7 @@ inline _LIBCPP_HIDE_FROM_ABI _ValueType __libcpp_acquire_load(_ValueType const*
 
 template <class _Callable, class... _Args>
 inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable&& __func, _Args&&... __args) {
-  if (__libcpp_acquire_load(&__flag.__state_) != once_flag::_Complete) {
+  if (__flag.__state_.load(memory_order_acquire) != once_flag::_Complete) {
     typedef tuple<_Callable&&, _Args&&...> _Gp;
     _Gp __f(std::forward<_Callable>(__func), std::forward<_Args>(__args)...);
     __call_once_param<_Gp> __p(__f);
@@ -140,7 +142,7 @@ inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable&& __fun
 
 template <class _Callable>
 inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable& __func) {
-  if (__libcpp_acquire_load(&__flag.__state_) != once_flag::_Complete) {
+  if (__flag.__state_.load(memory_order_acquire) != once_flag::_Complete) {
     __call_once_param<_Callable> __p(__func);
     std::__call_once(__flag.__state_, std::addressof(__p), std::addressof(__call_once_proxy<_Callable>));
   }
@@ -148,7 +150,7 @@ inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, _Callable& __func
 
 template <class _Callable>
 inline _LIBCPP_HIDE_FROM_ABI void call_once(once_flag& __flag, const _Callable& __func) {
-  if (__libcpp_acquire_load(&__flag.__state_) != once_flag::_Complete) {
+  if (__flag.__state_.load(memory_order_acquire) != once_flag::_Complete) {
     __call_once_param<const _Callable> __p(__func);
     std::__call_once(__flag.__state_, std::addressof(__p), std::addressof(__call_once_proxy<const _Callable>));
   }
diff --git a/libcxx/src/call_once.cpp b/libcxx/src/call_once.cpp
index e12ddc5eb0917..db18f2182c290 100644
--- a/libcxx/src/call_once.cpp
+++ b/libcxx/src/call_once.cpp
@@ -10,14 +10,6 @@
 #include <__mutex/once_flag.h>
 #include <__utility/exception_guard.h>
 
-#if _LIBCPP_HAS_THREADS
-#  include <__thread/support.h>
-#endif
-
-#include "include/atomic_support.h"
-
-#include <__atomic/contention_t.h>
-
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // If dispatch_once_f ever handles C++ exceptions, and if one can get to it
@@ -26,13 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // call into dispatch_once_f instead of here. Relevant radar this code needs to
 // keep in sync with:  7741191.
 
-#if _LIBCPP_HAS_THREADS
-_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile*) _NOEXCEPT;
-_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor(void const volatile*) _NOEXCEPT;
-_LIBCPP_EXPORTED_FROM_ABI void __libcpp_atomic_wait(void const volatile*, __cxx_contention_t) _NOEXCEPT;
-#endif
-
-void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(void*)) {
+void __call_once(atomic<once_flag::_State_type>& flag, void* arg, void (*func)(void*)) {
 #if !_LIBCPP_HAS_THREADS
 
   if (flag == once_flag::_Unset) {
@@ -45,30 +31,26 @@ void __call_once(volatile once_flag::_State_type& flag, void* arg, void (*func)(
 
 #else // !_LIBCPP_HAS_THREADS
 
-  auto flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
+  auto flag_read = flag.load(memory_order_acquire);
 
 WAIT:
   while (flag_read == once_flag::_Pending) {
-    __cxx_contention_t monitor = __libcpp_atomic_monitor(&flag);
-    flag_read                  = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
-    if (flag_read == once_flag::_Pending) {
-      __libcpp_atomic_wait(&flag, monitor);
-      flag_read = __atomic_load_n(&flag, __ATOMIC_ACQUIRE);
-    }
+    flag.wait(once_flag::_Pending, memory_order_acquire);
+    flag_read = flag.load(memory_order_acquire);
   }
 
   if (flag_read == once_flag::_Unset) {
     once_flag::_State_type expected = once_flag::_Unset;
-    if (__atomic_compare_exchange_n(&flag, &expected, once_flag::_Pending, false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) {
+    if (flag.compare_exchange_strong(expected, once_flag::_Pending, memory_order_acquire, memory_order_acquire)) {
       auto guard = std::__make_exception_guard([&flag] {
-        __libcpp_atomic_store(&flag, once_flag::_Unset, _AO_Release);
-        __cxx_atomic_notify_all(&flag);
+        flag.store(once_flag::_Unset, memory_order_release);
+        flag.notify_all();
       });
 
       func(arg);
 
-      __libcpp_atomic_store(&flag, once_flag::_Complete, _AO_Release);
-      __cxx_atomic_notify_all(&flag);
+      flag.store(once_flag::_Complete, memory_order_release);
+      flag.notify_all();
       guard.__complete();
 
     } else {



More information about the libcxx-commits mailing list