[libcxx-commits] [libcxx] [libc++] Speed up classic locale (take 2) (PR #73533)

Louis Dionne via libcxx-commits libcxx-commits at lists.llvm.org
Tue Nov 28 13:30:47 PST 2023


https://github.com/ldionne updated https://github.com/llvm/llvm-project/pull/73533

>From 700678cc6fc68f35c972e75306835ccbf9528d43 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Tue, 28 Nov 2023 10:07:03 -0500
Subject: [PATCH 1/2] [libc++] Speed up classic locale (#73533)

Locale objects use atomic reference counting, which may be very
expensive in parallel applications. The classic locale is used by
default by all streams and can be very contended. But it's never
destroyed, so the reference counting is also completely pointless on the
classic locale. Currently ~70% of time in the parallel stringstream
benchmarks is spent in locale ctor/dtor. And the execution radically
slows down with more threads.

Avoid reference counting on the classic locale. With this change
parallel benchmarks start to scale with threads.

This is a re-application of f8afc53d641c (aka PR #72112) which was
reverted in 4e0c48b907f1 because it broke the sanitizer builds due
to an initialization order fiasco. This issue has now been fixed by
ensuring that the locale is constinit'ed.

Co-authored-by: Louis Dionne <ldionne.2 at gmail.com>
---
 libcxx/benchmarks/stringstream.bench.cpp | 63 ++++++++++++++-
 libcxx/include/CMakeLists.txt            |  1 +
 libcxx/include/__locale                  |  1 +
 libcxx/include/__utility/no_destroy.h    | 63 +++++++++++++++
 libcxx/include/module.modulemap.in       |  1 +
 libcxx/src/locale.cpp                    | 98 +++++++++++-------------
 6 files changed, 172 insertions(+), 55 deletions(-)
 create mode 100644 libcxx/include/__utility/no_destroy.h

diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/benchmarks/stringstream.bench.cpp
index ea602557ccd770e..3cbe5ac5997a3c3 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/benchmarks/stringstream.bench.cpp
@@ -1,11 +1,12 @@
 #include "benchmark/benchmark.h"
 #include "test_macros.h"
 
+#include <mutex>
 #include <sstream>
 
 TEST_NOINLINE double istream_numbers();
 
-double istream_numbers() {
+double istream_numbers(std::locale* loc) {
   const char* a[] = {"-6  69 -71  2.4882e-02 -100 101 -2.00005 5000000 -50000000",
                      "-25 71   7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
                      "-14 53  46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,17 +15,73 @@ double istream_numbers() {
   double f1 = 0.0, f2 = 0.0, q = 0.0;
   for (int i = 0; i < 3; i++) {
     std::istringstream s(a[i]);
+    if (loc)
+      s.imbue(*loc);
     s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
     q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
   }
   return q;
 }
 
+struct LocaleSelector {
+  std::locale* imbue;
+  std::locale old;
+  static std::mutex mutex;
+
+  LocaleSelector(benchmark::State& state) {
+    std::lock_guard guard(mutex);
+    switch (state.range(0)) {
+    case 0: {
+      old   = std::locale::global(std::locale::classic());
+      imbue = nullptr;
+      break;
+    }
+    case 1: {
+      old = std::locale::global(std::locale::classic());
+      thread_local std::locale loc("en_US.UTF-8");
+      imbue = &loc;
+      break;
+    }
+    case 2: {
+      old = std::locale::global(std::locale::classic());
+      static std::locale loc("en_US.UTF-8");
+      imbue = &loc;
+      break;
+    }
+    case 3: {
+      old   = std::locale::global(std::locale("en_US.UTF-8"));
+      imbue = nullptr;
+      break;
+    }
+    }
+  }
+
+  ~LocaleSelector() {
+    std::lock_guard guard(mutex);
+    std::locale::global(old);
+  }
+};
+
+std::mutex LocaleSelector::mutex;
+
 static void BM_Istream_numbers(benchmark::State& state) {
+  LocaleSelector sel(state);
   double i = 0;
   while (state.KeepRunning())
-    benchmark::DoNotOptimize(i += istream_numbers());
+    benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
+}
+BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
+
+static void BM_Ostream_number(benchmark::State& state) {
+  LocaleSelector sel(state);
+  while (state.KeepRunning()) {
+    std::ostringstream ss;
+    if (sel.imbue)
+      ss.imbue(*sel.imbue);
+    ss << 0;
+    benchmark::DoNotOptimize(ss.str().c_str());
+  }
 }
+BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
 
-BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
 BENCHMARK_MAIN();
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index b0bc3718576f66b..15d4537518d31e8 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -849,6 +849,7 @@ set(files
   __utility/integer_sequence.h
   __utility/is_pointer_in_range.h
   __utility/move.h
+  __utility/no_destroy.h
   __utility/pair.h
   __utility/piecewise_construct.h
   __utility/priority_tag.h
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index fcae49e23fac59e..dbb9a1f1c81b3b0 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -15,6 +15,7 @@
 #include <__memory/shared_ptr.h> // __shared_count
 #include <__mutex/once_flag.h>
 #include <__type_traits/make_unsigned.h>
+#include <__utility/no_destroy.h>
 #include <cctype>
 #include <clocale>
 #include <cstdint>
diff --git a/libcxx/include/__utility/no_destroy.h b/libcxx/include/__utility/no_destroy.h
new file mode 100644
index 000000000000000..3563774301efba0
--- /dev/null
+++ b/libcxx/include/__utility/no_destroy.h
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___UTILITY_NO_DESTROY_H
+#define _LIBCPP___UTILITY_NO_DESTROY_H
+
+#include <__config>
+#include <__type_traits/is_constant_evaluated.h>
+#include <__utility/forward.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __uninitialized_tag {};
+
+// This class stores an object of type _Tp but never destroys it.
+//
+// This is akin to using __attribute__((no_destroy)), except that it is possible
+// to control the lifetime of the object with more flexibility by deciding e.g.
+// whether to initialize the object at construction or to defer to a later
+// initialization using __emplace.
+template <class _Tp>
+struct __no_destroy {
+  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) : __dummy_() {
+    if (__libcpp_is_constant_evaluated()) {
+      __dummy_ = char();
+    }
+  }
+  _LIBCPP_HIDE_FROM_ABI ~__no_destroy() {
+    // nothing
+  }
+
+  template <class... _Args>
+  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args)
+      : __obj_(std::forward<_Args>(__args)...) {}
+
+  template <class... _Args>
+  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
+    new (&__obj_) _Tp(std::forward<_Args>(__args)...);
+    return __obj_;
+  }
+
+  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return __obj_; }
+  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return __obj_; }
+
+private:
+  union {
+    _Tp __obj_;
+    char __dummy_; // so we can initialize a member even with __uninitialized_tag for constexpr-friendliness
+  };
+};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___UTILITY_NO_DESTROY_H
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 90381f4f7f720d4..cdf92c45ebca974 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -2042,6 +2042,7 @@ module std_private_utility_move                   [system] {
   export std_private_type_traits_is_nothrow_move_constructible
   export std_private_type_traits_remove_reference
 }
+module std_private_utility_no_destroy             [system] { header "__utility/no_destroy.h" }
 module std_private_utility_pair                   [system] {
   header "__utility/pair.h"
   export std_private_ranges_subrange_fwd
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 15cc2d7df6aeff8..d3de81b95c34b9a 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <__utility/no_destroy.h>
 #include <algorithm>
 #include <clocale>
 #include <codecvt>
@@ -81,9 +82,8 @@ locale_t __cloc() {
 
 namespace {
 
-struct release
-{
-    void operator()(locale::facet* p) {p->__release_shared();}
+struct releaser {
+  void operator()(locale::facet* p) { p->__release_shared(); }
 };
 
 template <class T, class ...Args>
@@ -155,7 +155,11 @@ class _LIBCPP_HIDDEN locale::__imp
         {return static_cast<size_t>(id) < facets_.size() && facets_[static_cast<size_t>(id)];}
     const locale::facet* use_facet(long id) const;
 
-private:
+    void acquire();
+    void release();
+    static __no_destroy<__imp> classic_locale_imp_;
+
+  private:
     void install(facet* f, long id);
     template <class F> void install(F* f) {install(f, f->id.__get());}
     template <class F> void install_from(const __imp& other);
@@ -500,7 +504,7 @@ locale::__imp::__imp(const __imp& other, facet* f, long id)
       name_("*")
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     facets_ = other.facets_;
     for (unsigned i = 0; i < other.facets_.size(); ++i)
         if (facets_[i])
@@ -519,7 +523,7 @@ void
 locale::__imp::install(facet* f, long id)
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     if (static_cast<size_t>(id) >= facets_.size())
         facets_.resize(static_cast<size_t>(id+1));
     if (facets_[static_cast<size_t>(id)])
@@ -537,89 +541,79 @@ locale::__imp::use_facet(long id) const
 
 // locale
 
-// This class basically implements __attribute__((no_destroy)), which isn't supported
-// by GCC as of writing this.
-template <class T>
-struct __no_destroy {
-    template <class... Args>
-    explicit __no_destroy(Args&&... args) {
-        T* obj = reinterpret_cast<T*>(&buf);
-        new (obj) T(std::forward<Args>(args)...);
-    }
-
-    T& get() { return *reinterpret_cast<T*>(&buf); }
-    T const& get() const { return *reinterpret_cast<T const*>(&buf); }
-
-  private:
-    alignas(T) byte buf[sizeof(T)];
-};
+// We don't do reference counting on the classic locale.
+// It's never destroyed anyway, but atomic reference counting may be very
+// expensive in parallel applications. The classic locale is used by default
+// in all streams. Note: if a new global locale is installed, then we lose
+// the benefit of no reference counting.
+constinit __no_destroy<locale::__imp>
+    locale::__imp::classic_locale_imp_(__uninitialized_tag{}); // initialized below in classic()
 
 const locale& locale::classic() {
-    static const __no_destroy<locale> c(__private_tag{}, &make<__imp>(1u));
-    return c.get();
+  static const __no_destroy<locale> classic_locale(__private_tag{}, [] {
+    // executed exactly once on first initialization of `classic_locale`
+    locale::__imp::classic_locale_imp_.__emplace(1u);
+    return &locale::__imp::classic_locale_imp_.__get();
+  }());
+  return classic_locale.__get();
 }
 
 locale& locale::__global() {
-    static __no_destroy<locale> g(locale::classic());
-    return g.get();
+  static __no_destroy<locale> g(locale::classic());
+  return g.__get();
 }
 
-locale::locale() noexcept
-    : __locale_(__global().__locale_)
-{
-    __locale_->__add_shared();
+void locale::__imp::acquire() {
+  if (this != &locale::__imp::classic_locale_imp_.__get())
+    __add_shared();
 }
 
-locale::locale(const locale& l) noexcept
-    : __locale_(l.__locale_)
-{
-    __locale_->__add_shared();
+void locale::__imp::release() {
+  if (this != &locale::__imp::classic_locale_imp_.__get())
+    __release_shared();
 }
 
-locale::~locale()
-{
-    __locale_->__release_shared();
-}
+locale::locale() noexcept : __locale_(__global().__locale_) { __locale_->acquire(); }
+
+locale::locale(const locale& l) noexcept : __locale_(l.__locale_) { __locale_->acquire(); }
+
+locale::~locale() { __locale_->release(); }
 
 const locale&
 locale::operator=(const locale& other) noexcept
 {
-    other.__locale_->__add_shared();
-    __locale_->__release_shared();
-    __locale_ = other.__locale_;
-    return *this;
+  other.__locale_->acquire();
+  __locale_->release();
+  __locale_ = other.__locale_;
+  return *this;
 }
 
 locale::locale(const char* name)
     : __locale_(name ? new __imp(name)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+  __locale_->acquire();
 }
 
-locale::locale(const string& name)
-    : __locale_(new __imp(name))
-{
-    __locale_->__add_shared();
-}
+locale::locale(const string& name) : __locale_(new __imp(name)) { __locale_->acquire(); }
 
 locale::locale(const locale& other, const char* name, category c)
     : __locale_(name ? new __imp(*other.__locale_, name, c)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+  __locale_->acquire();
 }
 
 locale::locale(const locale& other, const string& name, category c)
     : __locale_(new __imp(*other.__locale_, name, c))
 {
-    __locale_->__add_shared();
+  __locale_->acquire();
 }
 
 locale::locale(const locale& other, const locale& one, category c)
     : __locale_(new __imp(*other.__locale_, *one.__locale_, c))
 {
-    __locale_->__add_shared();
+  __locale_->acquire();
 }
 
 string
@@ -635,7 +629,7 @@ locale::__install_ctor(const locale& other, facet* f, long id)
         __locale_ = new __imp(*other.__locale_, f, id);
     else
         __locale_ = other.__locale_;
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale

>From 8a6841b5d26d129eb9dc66e036adaad71ff5e8c2 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 28 Nov 2023 16:30:24 -0500
Subject: [PATCH 2/2] Adjust constexpr

---
 libcxx/include/__utility/no_destroy.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libcxx/include/__utility/no_destroy.h b/libcxx/include/__utility/no_destroy.h
index 3563774301efba0..f9c1eb7bed4569c 100644
--- a/libcxx/include/__utility/no_destroy.h
+++ b/libcxx/include/__utility/no_destroy.h
@@ -29,7 +29,7 @@ struct __uninitialized_tag {};
 // initialization using __emplace.
 template <class _Tp>
 struct __no_destroy {
-  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) : __dummy_() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) : __dummy_() {
     if (__libcpp_is_constant_evaluated()) {
       __dummy_ = char();
     }
@@ -43,13 +43,13 @@ struct __no_destroy {
       : __obj_(std::forward<_Args>(__args)...) {}
 
   template <class... _Args>
-  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
     new (&__obj_) _Tp(std::forward<_Args>(__args)...);
     return __obj_;
   }
 
-  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return __obj_; }
-  _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return __obj_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return __obj_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return __obj_; }
 
 private:
   union {



More information about the libcxx-commits mailing list