[libcxx-commits] [libcxx] [libc++] Speed up classic locale (PR #72112)

Dmitry Vyukov via libcxx-commits libcxx-commits at lists.llvm.org
Thu Nov 16 06:02:38 PST 2023


https://github.com/dvyukov updated https://github.com/llvm/llvm-project/pull/72112

>From 51487e2932ecf2828890f2f487bee8753494e621 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Mon, 13 Nov 2023 12:29:22 +0100
Subject: [PATCH 1/2] [libc++] Speed up classic locale
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Locale objects use atomic reference counting, which can be very expensive
in parallel applications. The classic locale is used by default by all
streams, so its reference count can be heavily contended. However, the classic
locale is never destroyed, so reference counting it is pointless. Currently ~70%
of the time in the parallel stringstream benchmarks is spent in the locale
ctor/dtor, and execution slows down dramatically as more threads are added.

Avoid reference counting on the classic locale. With this change parallel
benchmarks start to scale with threads.

                              │   baseline   │    optimized                            │
                              │    sec/op    │    sec/op      vs base                  │
Istream_numbers/0/threads:1      4.672µ ± 0%   4.419µ ± 0%     -5.42% (p=0.000 n=30+39)
Istream_numbers/0/threads:72   539.817µ ± 0%   9.842µ ± 1%    -98.18% (p=0.000 n=30+40)
Istream_numbers/1/threads:1      4.890µ ± 0%   4.750µ ± 0%     -2.85% (p=0.000 n=30+40)
Istream_numbers/1/threads:72     66.44µ ± 1%   10.14µ ± 1%    -84.74% (p=0.000 n=30+40)
Istream_numbers/2/threads:1      4.888µ ± 0%   4.746µ ± 0%     -2.92% (p=0.000 n=30+40)
Istream_numbers/2/threads:72     494.8µ ± 0%   410.2µ ± 1%    -17.11% (p=0.000 n=30+40)
Istream_numbers/3/threads:1      4.697µ ± 0%   4.695µ ± 5%          ~ (p=0.391 n=30+37)
Istream_numbers/3/threads:72     421.5µ ± 7%   421.9µ ± 9%          ~ (p=0.665 n=30)
Ostream_number/0/threads:1       183.0n ± 0%   141.0n ± 2%    -22.95% (p=0.000 n=30)
Ostream_number/0/threads:72    24196.5n ± 1%   343.5n ± 3%    -98.58% (p=0.000 n=30)
Ostream_number/1/threads:1       250.0n ± 0%   196.0n ± 2%    -21.60% (p=0.000 n=30)
Ostream_number/1/threads:72    16260.5n ± 0%   407.0n ± 2%    -97.50% (p=0.000 n=30)
Ostream_number/2/threads:1       254.0n ± 0%   196.0n ± 1%    -22.83% (p=0.000 n=30)
Ostream_number/2/threads:72      28.49µ ± 1%   18.89µ ± 5%    -33.72% (p=0.000 n=30)
Ostream_number/3/threads:1       185.0n ± 0%   185.0n ± 0%      0.00% (p=0.017 n=30)
Ostream_number/3/threads:72      19.38µ ± 4%   19.33µ ± 5%          ~ (p=0.425 n=30)
---
 libcxx/benchmarks/stringstream.bench.cpp | 62 ++++++++++++++++++++++--
 libcxx/src/locale.cpp                    | 48 ++++++++++++------
 2 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/benchmarks/stringstream.bench.cpp
index ea602557ccd770e..c10ee3a8cc5b83c 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/benchmarks/stringstream.bench.cpp
@@ -1,11 +1,12 @@
 #include "benchmark/benchmark.h"
 #include "test_macros.h"
 
+#include <mutex>
 #include <sstream>
 
 TEST_NOINLINE double istream_numbers();
 
-double istream_numbers() {
+double istream_numbers(std::locale* l) {
   const char* a[] = {"-6  69 -71  2.4882e-02 -100 101 -2.00005 5000000 -50000000",
                      "-25 71   7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
                      "-14 53  46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,17 +15,72 @@ double istream_numbers() {
   double f1 = 0.0, f2 = 0.0, q = 0.0;
   for (int i = 0; i < 3; i++) {
     std::istringstream s(a[i]);
+    if (l)
+      s.imbue(*l);
     s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
     q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
   }
   return q;
 }
 
+struct LocaleSelector {
+  std::locale* imbue;
+  std::locale old;
+
+  LocaleSelector(benchmark::State& state) {
+    static std::mutex mu;
+    std::lock_guard l(mu);
+    switch (state.range(0)) {
+    case 0: {
+      old   = std::locale::global(std::locale::classic());
+      imbue = nullptr;
+      break;
+    }
+    case 1: {
+      old = std::locale::global(std::locale::classic());
+      thread_local std::locale l("en_US.UTF-8");
+      imbue = &l;
+      break;
+    }
+    case 2: {
+      old = std::locale::global(std::locale::classic());
+      static std::locale l("en_US.UTF-8");
+      imbue = &l;
+      break;
+    }
+    case 3: {
+      old   = std::locale::global(std::locale("en_US.UTF-8"));
+      imbue = nullptr;
+      break;
+    }
+    }
+  }
+
+  ~LocaleSelector() {
+    static std::mutex mu;
+    std::lock_guard l(mu);
+    std::locale::global(old);
+  }
+};
+
 static void BM_Istream_numbers(benchmark::State& state) {
+  LocaleSelector sel(state);
   double i = 0;
   while (state.KeepRunning())
-    benchmark::DoNotOptimize(i += istream_numbers());
+    benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
+}
+BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
+
+static void BM_Ostream_number(benchmark::State& state) {
+  LocaleSelector sel(state);
+  while (state.KeepRunning()) {
+    std::ostringstream ss;
+    if (sel.imbue)
+      ss.imbue(*sel.imbue);
+    ss << 0;
+    benchmark::DoNotOptimize(ss.str().c_str());
+  }
 }
+BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
 
-BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
 BENCHMARK_MAIN();
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index c37e091dcc4671b..58b2d6c33606ba9 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -8,6 +8,7 @@
 
 #include <__utility/unreachable.h>
 #include <algorithm>
+#include <atomic>
 #include <clocale>
 #include <codecvt>
 #include <cstddef>
@@ -80,7 +81,7 @@ locale_t __cloc() {
 
 namespace {
 
-struct release
+struct releaser
 {
     void operator()(locale::facet* p) {p->__release_shared();}
 };
@@ -154,12 +155,16 @@ class _LIBCPP_HIDDEN locale::__imp
         {return static_cast<size_t>(id) < facets_.size() && facets_[static_cast<size_t>(id)];}
     const locale::facet* use_facet(long id) const;
 
+    void acquire();
+    void release();
+
     static const locale& make_classic();
     static       locale& make_global();
 private:
     void install(facet* f, long id);
     template <class F> void install(F* f) {install(f, f->id.__get());}
     template <class F> void install_from(const __imp& other);
+    static std::atomic<__imp*> classic_;
 };
 
 locale::__imp::__imp(size_t refs)
@@ -501,7 +506,7 @@ locale::__imp::__imp(const __imp& other, facet* f, long id)
       name_("*")
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     facets_ = other.facets_;
     for (unsigned i = 0; i < other.facets_.size(); ++i)
         if (facets_[i])
@@ -520,7 +525,7 @@ void
 locale::__imp::install(facet* f, long id)
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     if (static_cast<size_t>(id) >= facets_.size())
         facets_.resize(static_cast<size_t>(id+1));
     if (facets_[static_cast<size_t>(id)])
@@ -538,6 +543,8 @@ locale::__imp::use_facet(long id) const
 
 // locale
 
+std::atomic<locale::__imp*> locale::__imp::classic_;
+
 const locale&
 locale::__imp::make_classic()
 {
@@ -545,9 +552,22 @@ locale::__imp::make_classic()
     alignas(locale) static std::byte buf[sizeof(locale)];
     locale* c = reinterpret_cast<locale*>(&buf);
     c->__locale_ = &make<__imp>(1u);
+    classic_.store(c->__locale_, std::memory_order_relaxed);
     return *c;
 }
 
+void locale::__imp::acquire()
+{
+    if (this != classic_.load(std::memory_order_relaxed))
+        __add_shared();
+}
+
+void locale::__imp::release()
+{
+    if (this != classic_.load(std::memory_order_relaxed))
+        __release_shared();
+}
+
 const locale&
 locale::classic()
 {
@@ -574,25 +594,25 @@ locale::__global()
 locale::locale() noexcept
     : __locale_(__global().__locale_)
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& l) noexcept
     : __locale_(l.__locale_)
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::~locale()
 {
-    __locale_->__release_shared();
+    __locale_->release();
 }
 
 const locale&
 locale::operator=(const locale& other) noexcept
 {
-    other.__locale_->__add_shared();
-    __locale_->__release_shared();
+    other.__locale_->acquire();
+    __locale_->release();
     __locale_ = other.__locale_;
     return *this;
 }
@@ -601,32 +621,32 @@ locale::locale(const char* name)
     : __locale_(name ? new __imp(name)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const string& name)
     : __locale_(new __imp(name))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const char* name, category c)
     : __locale_(name ? new __imp(*other.__locale_, name, c)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const string& name, category c)
     : __locale_(new __imp(*other.__locale_, name, c))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const locale& one, category c)
     : __locale_(new __imp(*other.__locale_, *one.__locale_, c))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 string
@@ -642,7 +662,7 @@ locale::__install_ctor(const locale& other, facet* f, long id)
         __locale_ = new __imp(*other.__locale_, f, id);
     else
         __locale_ = other.__locale_;
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale

>From c38f931e10bdf39b46268b8aec9ae64582cdf19c Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Thu, 16 Nov 2023 15:02:16 +0100
Subject: [PATCH 2/2] add more comments

---
 libcxx/src/locale.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 58b2d6c33606ba9..6668943e3187432 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -164,6 +164,12 @@ class _LIBCPP_HIDDEN locale::__imp
     void install(facet* f, long id);
     template <class F> void install(F* f) {install(f, f->id.__get());}
     template <class F> void install_from(const __imp& other);
+
+    // We don't do reference counting on the classic locale.
+    // It's never destroyed anyway, and atomic reference counting may be very
+    // expensive in parallel applications. The classic locale is used by default
+    // in all streams. Note: if a new global locale is installed, then we lose
+    // the benefit of skipping reference counting.
     static std::atomic<__imp*> classic_;
 };
 
@@ -552,6 +558,15 @@ locale::__imp::make_classic()
     alignas(locale) static std::byte buf[sizeof(locale)];
     locale* c = reinterpret_cast<locale*>(&buf);
     c->__locale_ = &make<__imp>(1u);
+    // We use relaxed memory ordering because readers don't access
+    // the contents of the object; they are only interested in the
+    // pointer value.
+    // If a locale uses the classic imp, then this store happens
+    // before any call to its acquire/release methods, so they must
+    // observe the right value and omit reference counting.
+    // If a locale uses a non-classic imp, then it does not matter
+    // what value it will load; the result of the comparison will
+    // be false in all cases.
     classic_.store(c->__locale_, std::memory_order_relaxed);
     return *c;
 }



More information about the libcxx-commits mailing list