[libcxx-commits] [libcxx] [libc++] Speed up classic locale (PR #72112)

Louis Dionne via libcxx-commits libcxx-commits at lists.llvm.org
Thu Nov 23 08:24:41 PST 2023


https://github.com/ldionne updated https://github.com/llvm/llvm-project/pull/72112

>From e45ddc0e249b0b4f8987fd0b3d6d15e130df383b Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Mon, 13 Nov 2023 12:29:22 +0100
Subject: [PATCH 1/4] [libc++] Speed up classic locale
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Locale objects use atomic reference counting, which may be very expensive
in parallel applications. The classic locale is used by default by all
streams and can be very contended. But it's never destroyed, so the reference
counting is also completely pointless on the classic locale. Currently ~70%
of time in the parallel stringstream benchmarks is spent in locale ctor/dtor.
And the execution radically slows down with more threads.

Avoid reference counting on the classic locale. With this change parallel
benchmarks start to scale with threads.

                              │   baseline   │    optimized                            │
                              │    sec/op    │    sec/op      vs base                  │
Istream_numbers/0/threads:1      4.672µ ± 0%   4.419µ ± 0%     -5.42% (p=0.000 n=30+39)
Istream_numbers/0/threads:72   539.817µ ± 0%   9.842µ ± 1%    -98.18% (p=0.000 n=30+40)
Istream_numbers/1/threads:1      4.890µ ± 0%   4.750µ ± 0%     -2.85% (p=0.000 n=30+40)
Istream_numbers/1/threads:72     66.44µ ± 1%   10.14µ ± 1%    -84.74% (p=0.000 n=30+40)
Istream_numbers/2/threads:1      4.888µ ± 0%   4.746µ ± 0%     -2.92% (p=0.000 n=30+40)
Istream_numbers/2/threads:72     494.8µ ± 0%   410.2µ ± 1%    -17.11% (p=0.000 n=30+40)
Istream_numbers/3/threads:1      4.697µ ± 0%   4.695µ ± 5%          ~ (p=0.391 n=30+37)
Istream_numbers/3/threads:72     421.5µ ± 7%   421.9µ ± 9%          ~ (p=0.665 n=30)
Ostream_number/0/threads:1       183.0n ± 0%   141.0n ± 2%    -22.95% (p=0.000 n=30)
Ostream_number/0/threads:72    24196.5n ± 1%   343.5n ± 3%    -98.58% (p=0.000 n=30)
Ostream_number/1/threads:1       250.0n ± 0%   196.0n ± 2%    -21.60% (p=0.000 n=30)
Ostream_number/1/threads:72    16260.5n ± 0%   407.0n ± 2%    -97.50% (p=0.000 n=30)
Ostream_number/2/threads:1       254.0n ± 0%   196.0n ± 1%    -22.83% (p=0.000 n=30)
Ostream_number/2/threads:72      28.49µ ± 1%   18.89µ ± 5%    -33.72% (p=0.000 n=30)
Ostream_number/3/threads:1       185.0n ± 0%   185.0n ± 0%      0.00% (p=0.017 n=30)
Ostream_number/3/threads:72      19.38µ ± 4%   19.33µ ± 5%          ~ (p=0.425 n=30)
---
 libcxx/benchmarks/stringstream.bench.cpp | 62 ++++++++++++++++++++++--
 libcxx/src/locale.cpp                    | 47 ++++++++++++------
 2 files changed, 92 insertions(+), 17 deletions(-)

diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/benchmarks/stringstream.bench.cpp
index ea602557ccd770e..c10ee3a8cc5b83c 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/benchmarks/stringstream.bench.cpp
@@ -1,11 +1,12 @@
 #include "benchmark/benchmark.h"
 #include "test_macros.h"
 
+#include <mutex>
 #include <sstream>
 
 TEST_NOINLINE double istream_numbers();
 
-double istream_numbers() {
+double istream_numbers(std::locale* l) {
   const char* a[] = {"-6  69 -71  2.4882e-02 -100 101 -2.00005 5000000 -50000000",
                      "-25 71   7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
                      "-14 53  46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,17 +15,72 @@ double istream_numbers() {
   double f1 = 0.0, f2 = 0.0, q = 0.0;
   for (int i = 0; i < 3; i++) {
     std::istringstream s(a[i]);
+    if (l)
+      s.imbue(*l);
     s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
     q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
   }
   return q;
 }
 
+struct LocaleSelector {
+  std::locale* imbue;
+  std::locale old;
+
+  LocaleSelector(benchmark::State& state) {
+    static std::mutex mu;
+    std::lock_guard l(mu);
+    switch (state.range(0)) {
+    case 0: {
+      old   = std::locale::global(std::locale::classic());
+      imbue = nullptr;
+      break;
+    }
+    case 1: {
+      old = std::locale::global(std::locale::classic());
+      thread_local std::locale l("en_US.UTF-8");
+      imbue = &l;
+      break;
+    }
+    case 2: {
+      old = std::locale::global(std::locale::classic());
+      static std::locale l("en_US.UTF-8");
+      imbue = &l;
+      break;
+    }
+    case 3: {
+      old   = std::locale::global(std::locale("en_US.UTF-8"));
+      imbue = nullptr;
+      break;
+    }
+    }
+  }
+
+  ~LocaleSelector() {
+    static std::mutex mu;
+    std::lock_guard l(mu);
+    std::locale::global(old);
+  }
+};
+
 static void BM_Istream_numbers(benchmark::State& state) {
+  LocaleSelector sel(state);
   double i = 0;
   while (state.KeepRunning())
-    benchmark::DoNotOptimize(i += istream_numbers());
+    benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
+}
+BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
+
+static void BM_Ostream_number(benchmark::State& state) {
+  LocaleSelector sel(state);
+  while (state.KeepRunning()) {
+    std::ostringstream ss;
+    if (sel.imbue)
+      ss.imbue(*sel.imbue);
+    ss << 0;
+    benchmark::DoNotOptimize(ss.str().c_str());
+  }
 }
+BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
 
-BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
 BENCHMARK_MAIN();
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index f3b281d98825e44..64e202677dd404c 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <algorithm>
+#include <atomic>
 #include <clocale>
 #include <codecvt>
 #include <cstddef>
@@ -81,7 +82,7 @@ locale_t __cloc() {
 
 namespace {
 
-struct release
+struct releaser
 {
     void operator()(locale::facet* p) {p->__release_shared();}
 };
@@ -155,10 +156,13 @@ class _LIBCPP_HIDDEN locale::__imp
         {return static_cast<size_t>(id) < facets_.size() && facets_[static_cast<size_t>(id)];}
     const locale::facet* use_facet(long id) const;
 
+    void acquire();
+    void release();
 private:
     void install(facet* f, long id);
     template <class F> void install(F* f) {install(f, f->id.__get());}
     template <class F> void install_from(const __imp& other);
+    static std::atomic<__imp*> classic_;
 };
 
 locale::__imp::__imp(size_t refs)
@@ -500,7 +504,7 @@ locale::__imp::__imp(const __imp& other, facet* f, long id)
       name_("*")
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     facets_ = other.facets_;
     for (unsigned i = 0; i < other.facets_.size(); ++i)
         if (facets_[i])
@@ -519,7 +523,7 @@ void
 locale::__imp::install(facet* f, long id)
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     if (static_cast<size_t>(id) >= facets_.size())
         facets_.resize(static_cast<size_t>(id+1));
     if (facets_[static_cast<size_t>(id)])
@@ -554,38 +558,53 @@ struct __no_destroy {
     alignas(T) byte buf[sizeof(T)];
 };
 
+std::atomic<locale::__imp*> locale::__imp::classic_;
+
 const locale& locale::classic() {
     static const __no_destroy<locale> c(__private_tag{}, &make<__imp>(1u));
+    // TODO:
+    classic_.store(c->__locale_, std::memory_order_relaxed);
     return c.get();
 }
 
 locale& locale::__global() {
     static __no_destroy<locale> g(locale::classic());
     return g.get();
+
+void locale::__imp::acquire()
+{
+    if (this != classic_.load(std::memory_order_relaxed))
+        __add_shared();
+}
+
+void locale::__imp::release()
+{
+    if (this != classic_.load(std::memory_order_relaxed))
+        __release_shared();
 }
 
 locale::locale() noexcept
     : __locale_(__global().__locale_)
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& l) noexcept
     : __locale_(l.__locale_)
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::~locale()
 {
-    __locale_->__release_shared();
+    __locale_->release();
 }
 
 const locale&
 locale::operator=(const locale& other) noexcept
 {
-    other.__locale_->__add_shared();
-    __locale_->__release_shared();
+    other.__locale_->acquire();
+    __locale_->release();
     __locale_ = other.__locale_;
     return *this;
 }
@@ -594,32 +613,32 @@ locale::locale(const char* name)
     : __locale_(name ? new __imp(name)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const string& name)
     : __locale_(new __imp(name))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const char* name, category c)
     : __locale_(name ? new __imp(*other.__locale_, name, c)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const string& name, category c)
     : __locale_(new __imp(*other.__locale_, name, c))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const locale& one, category c)
     : __locale_(new __imp(*other.__locale_, *one.__locale_, c))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 string
@@ -635,7 +654,7 @@ locale::__install_ctor(const locale& other, facet* f, long id)
         __locale_ = new __imp(*other.__locale_, f, id);
     else
         __locale_ = other.__locale_;
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale

>From 8199841d8af2424f50c25f82caf18b968413170c Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Thu, 16 Nov 2023 15:02:16 +0100
Subject: [PATCH 2/4] add more comments

---
 libcxx/src/locale.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 64e202677dd404c..998fe12eef4a6b6 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -162,6 +162,12 @@ class _LIBCPP_HIDDEN locale::__imp
     void install(facet* f, long id);
     template <class F> void install(F* f) {install(f, f->id.__get());}
     template <class F> void install_from(const __imp& other);
+
+    // We don't do reference counting on the classic locale.
+    // It's never destroyed anyway, but atomic reference counting may be very
+    // expensive in parallel applications. The classic locale is used by default
+    // in all streams. Note: if a new global locale is installed, then we lose
+    // the benefit of no reference counting.
     static std::atomic<__imp*> classic_;
 };
 
@@ -563,6 +569,15 @@ std::atomic<locale::__imp*> locale::__imp::classic_;
 const locale& locale::classic() {
     static const __no_destroy<locale> c(__private_tag{}, &make<__imp>(1u));
     // TODO:
+    // We use relaxed memory ordering because readers don't access
+    // the contents of the objects, they are interested in just the
+    // pointer value.
+    // If a locale uses the classic imp, then this store happens
+    // before acquire/release methods, and they must observe the
+    // right value and omit reference counting.
+    // If a locale uses a non-classic imp, then it does not matter
+    // what value it will load, the result of the comparison will
+    // be false in all cases.
     classic_.store(c->__locale_, std::memory_order_relaxed);
     return c.get();
 }

>From 905907b69098f116da4eb685599abe1c7c89c22b Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Thu, 23 Nov 2023 10:35:36 -0500
Subject: [PATCH 3/4] Adjust after rebase

---
 libcxx/src/locale.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 998fe12eef4a6b6..946b85b45aef43d 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -567,18 +567,20 @@ struct __no_destroy {
 std::atomic<locale::__imp*> locale::__imp::classic_;
 
 const locale& locale::classic() {
-    static const __no_destroy<locale> c(__private_tag{}, &make<__imp>(1u));
-    // TODO:
-    // We use relaxed memory ordering because readers don't access
-    // the contents of the objects, they are interested in just the
-    // pointer value.
-    // If a locale uses the classic imp, then this store happens
-    // before acquire/release methods, and they must observe the
-    // right value and omit reference counting.
-    // If a locale uses a non-classic imp, then it does not matter
-    // what value it will load, the result of the comparison will
-    // be false in all cases.
-    classic_.store(c->__locale_, std::memory_order_relaxed);
+    static const __no_destroy<locale> c(__private_tag{}, []{
+        __imp* ptr = &make<__imp>(1u);
+        // We use relaxed memory ordering because readers don't access
+        // the contents of the objects, they are interested in just the
+        // pointer value.
+        // If a locale uses the classic imp, then this store happens
+        // before acquire/release methods, and they must observe the
+        // right value and omit reference counting.
+        // If a locale uses a non-classic imp, then it does not matter
+        // what value it will load, the result of the comparison will
+        // be false in all cases.
+        classic_.store(ptr, std::memory_order_relaxed);
+        return ptr;
+    }());
     return c.get();
 }
 

>From 3a4039ddbefe6987c9c5b0ed7b2dd3fee5486f1a Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Thu, 23 Nov 2023 11:10:28 -0500
Subject: [PATCH 4/4] Implement suggestion without needing atomics

---
 libcxx/include/CMakeLists.txt         |  1 +
 libcxx/include/__locale               |  1 +
 libcxx/include/__utility/no_destroy.h | 54 +++++++++++++++++++
 libcxx/src/locale.cpp                 | 75 +++++++++------------------
 4 files changed, 80 insertions(+), 51 deletions(-)
 create mode 100644 libcxx/include/__utility/no_destroy.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 889d7fedbf2965f..521c586c0e07957 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -850,6 +850,7 @@ set(files
   __utility/integer_sequence.h
   __utility/is_pointer_in_range.h
   __utility/move.h
+  __utility/no_destroy.h
   __utility/pair.h
   __utility/piecewise_construct.h
   __utility/priority_tag.h
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index fcae49e23fac59e..dbb9a1f1c81b3b0 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -15,6 +15,7 @@
 #include <__memory/shared_ptr.h> // __shared_count
 #include <__mutex/once_flag.h>
 #include <__type_traits/make_unsigned.h>
+#include <__utility/no_destroy.h>
 #include <cctype>
 #include <clocale>
 #include <cstdint>
diff --git a/libcxx/include/__utility/no_destroy.h b/libcxx/include/__utility/no_destroy.h
new file mode 100644
index 000000000000000..6611401760e466c
--- /dev/null
+++ b/libcxx/include/__utility/no_destroy.h
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___UTILITY_NO_DESTROY_H
+#define _LIBCPP___UTILITY_NO_DESTROY_H
+
+#include <__config>
+#include <__utility/forward.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __uninitialized_tag {};
+struct __zero_initialized_tag {};
+
+// This class stores an object of type T but never destroys it.
+//
+// This is akin to using __attribute__((no_destroy)), except that it is possible
+// to control the lifetime of the object with more flexibility by deciding e.g.
+// whether to initialize the object at construction or to defer to a later
+// initialization using __emplace.
+template <class _Tp>
+struct __no_destroy {
+  _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) {}
+  _LIBCPP_HIDE_FROM_ABI explicit constexpr __no_destroy(__zero_initialized_tag) : __buf{} {}
+
+  template <class... _Args>
+  _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args) {
+    new (&__buf) _Tp(std::forward<_Args>(__args)...);
+  }
+
+  template <class... _Args>
+  _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
+    return *new (&__buf) _Tp(std::forward<_Args>(__args)...);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return *reinterpret_cast<_Tp*>(&__buf); }
+  _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return *reinterpret_cast<_Tp const*>(&__buf); }
+
+private:
+  _ALIGNAS_TYPE(_Tp) char __buf[sizeof(_Tp)];
+};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___UTILITY_NO_DESTROY_H
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 946b85b45aef43d..3af08ddf5ee6c30 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <__utility/no_destroy.h>
 #include <algorithm>
-#include <atomic>
 #include <clocale>
 #include <codecvt>
 #include <cstddef>
@@ -158,17 +158,12 @@ class _LIBCPP_HIDDEN locale::__imp
 
     void acquire();
     void release();
+    static __no_destroy<__imp> classic_locale_imp_;
+
 private:
     void install(facet* f, long id);
     template <class F> void install(F* f) {install(f, f->id.__get());}
     template <class F> void install_from(const __imp& other);
-
-    // We don't do reference counting on the classic locale.
-    // It's never destroyed anyway, but atomic reference counting may be very
-    // expensive in parallel applications. The classic locale is used by default
-    // in all streams. Note: if a new global locale is installed, then we lose
-    // the benefit of no reference counting.
-    static std::atomic<__imp*> classic_;
 };
 
 locale::__imp::__imp(size_t refs)
@@ -547,57 +542,35 @@ locale::__imp::use_facet(long id) const
 
 // locale
 
-// This class basically implements __attribute__((no_destroy)), which isn't supported
-// by GCC as of writing this.
-template <class T>
-struct __no_destroy {
-    template <class... Args>
-    explicit __no_destroy(Args&&... args) {
-        T* obj = reinterpret_cast<T*>(&buf);
-        new (obj) T(std::forward<Args>(args)...);
-    }
-
-    T& get() { return *reinterpret_cast<T*>(&buf); }
-    T const& get() const { return *reinterpret_cast<T const*>(&buf); }
-
-  private:
-    alignas(T) byte buf[sizeof(T)];
-};
-
-std::atomic<locale::__imp*> locale::__imp::classic_;
+// We don't do reference counting on the classic locale.
+// It's never destroyed anyway, but atomic reference counting may be very
+// expensive in parallel applications. The classic locale is used by default
+// in all streams. Note: if a new global locale is installed, then we lose
+// the benefit of no reference counting.
+__no_destroy<locale::__imp> locale::__imp::classic_locale_imp_(__uninitialized_tag{}); // initialized below in classic()
 
 const locale& locale::classic() {
-    static const __no_destroy<locale> c(__private_tag{}, []{
-        __imp* ptr = &make<__imp>(1u);
-        // We use relaxed memory ordering because readers don't access
-        // the contents of the objects, they are interested in just the
-        // pointer value.
-        // If a locale uses the classic imp, then this store happens
-        // before acquire/release methods, and they must observe the
-        // right value and omit reference counting.
-        // If a locale uses a non-classic imp, then it does not matter
-        // what value it will load, the result of the comparison will
-        // be false in all cases.
-        classic_.store(ptr, std::memory_order_relaxed);
-        return ptr;
-    }());
-    return c.get();
+  static const __no_destroy<locale> classic_locale(__private_tag{}, [] {
+    // executed exactly once on first initialization of `classic_locale`
+    locale::__imp::classic_locale_imp_.__emplace(1u);
+    return &locale::__imp::classic_locale_imp_.__get();
+  }());
+  return classic_locale.__get();
 }
 
 locale& locale::__global() {
-    static __no_destroy<locale> g(locale::classic());
-    return g.get();
+  static __no_destroy<locale> g(locale::classic());
+  return g.__get();
+}
 
-void locale::__imp::acquire()
-{
-    if (this != classic_.load(std::memory_order_relaxed))
-        __add_shared();
+void locale::__imp::acquire() {
+  if (this != &locale::__imp::classic_locale_imp_.__get())
+    __add_shared();
 }
 
-void locale::__imp::release()
-{
-    if (this != classic_.load(std::memory_order_relaxed))
-        __release_shared();
+void locale::__imp::release() {
+  if (this != &locale::__imp::classic_locale_imp_.__get())
+    __release_shared();
 }
 
 locale::locale() noexcept



More information about the libcxx-commits mailing list