[libcxx-commits] [libcxx] [libc++] Speed up classic locale (PR #72112)

via libcxx-commits libcxx-commits at lists.llvm.org
Mon Nov 13 04:32:58 PST 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-libcxx

Author: Dmitry Vyukov (dvyukov)

<details>
<summary>Changes</summary>

Locale objects use atomic reference counting, which can be very expensive in parallel applications. The classic locale is used by default by all streams and can be heavily contended. However, it is never destroyed, so reference counting on the classic locale is completely pointless. Currently ~70% of the time in the parallel stringstream benchmarks is spent in the locale ctor/dtor, and execution slows down radically as more threads are added.

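Avoid reference counting on the classic locale. With this change, the parallel benchmarks start to scale with the number of threads. In the results below, the benchmark argument selects the locale setup: 0 uses the classic locale as the global locale with nothing imbued, 1 imbues a thread-local `en_US.UTF-8` locale, 2 imbues a shared static `en_US.UTF-8` locale, and 3 installs `en_US.UTF-8` as the global locale with nothing imbued.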

```
                              │   baseline   │    optimized                            │
                              │    sec/op    │    sec/op      vs base                  │
Istream_numbers/0/threads:1      4.672µ ± 0%   4.419µ ± 0%     -5.42% (p=0.000 n=30+39)
Istream_numbers/0/threads:72   539.817µ ± 0%   9.842µ ± 1%    -98.18% (p=0.000 n=30+40)
Istream_numbers/1/threads:1      4.890µ ± 0%   4.750µ ± 0%     -2.85% (p=0.000 n=30+40)
Istream_numbers/1/threads:72     66.44µ ± 1%   10.14µ ± 1%    -84.74% (p=0.000 n=30+40)
Istream_numbers/2/threads:1      4.888µ ± 0%   4.746µ ± 0%     -2.92% (p=0.000 n=30+40)
Istream_numbers/2/threads:72     494.8µ ± 0%   410.2µ ± 1%    -17.11% (p=0.000 n=30+40)
Istream_numbers/3/threads:1      4.697µ ± 0%   4.695µ ± 5%          ~ (p=0.391 n=30+37)
Istream_numbers/3/threads:72     421.5µ ± 7%   421.9µ ± 9%          ~ (p=0.665 n=30)
Ostream_number/0/threads:1       183.0n ± 0%   141.0n ± 2%    -22.95% (p=0.000 n=30)
Ostream_number/0/threads:72    24196.5n ± 1%   343.5n ± 3%    -98.58% (p=0.000 n=30)
Ostream_number/1/threads:1       250.0n ± 0%   196.0n ± 2%    -21.60% (p=0.000 n=30)
Ostream_number/1/threads:72    16260.5n ± 0%   407.0n ± 2%    -97.50% (p=0.000 n=30)
Ostream_number/2/threads:1       254.0n ± 0%   196.0n ± 1%    -22.83% (p=0.000 n=30)
Ostream_number/2/threads:72      28.49µ ± 1%   18.89µ ± 5%    -33.72% (p=0.000 n=30)
Ostream_number/3/threads:1       185.0n ± 0%   185.0n ± 0%      0.00% (p=0.017 n=30)
Ostream_number/3/threads:72      19.38µ ± 4%   19.33µ ± 5%          ~ (p=0.425 n=30)
```

---
Full diff: https://github.com/llvm/llvm-project/pull/72112.diff


2 Files Affected:

- (modified) libcxx/benchmarks/stringstream.bench.cpp (+59-3) 
- (modified) libcxx/src/locale.cpp (+34-14) 


``````````diff
diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/benchmarks/stringstream.bench.cpp
index ea602557ccd770e..c10ee3a8cc5b83c 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/benchmarks/stringstream.bench.cpp
@@ -1,11 +1,12 @@
 #include "benchmark/benchmark.h"
 #include "test_macros.h"
 
+#include <mutex>
 #include <sstream>
 
 TEST_NOINLINE double istream_numbers();
 
-double istream_numbers() {
+double istream_numbers(std::locale* l) {
   const char* a[] = {"-6  69 -71  2.4882e-02 -100 101 -2.00005 5000000 -50000000",
                      "-25 71   7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
                      "-14 53  46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,17 +15,72 @@ double istream_numbers() {
   double f1 = 0.0, f2 = 0.0, q = 0.0;
   for (int i = 0; i < 3; i++) {
     std::istringstream s(a[i]);
+    if (l)
+      s.imbue(*l);
     s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
     q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
   }
   return q;
 }
 
+struct LocaleSelector {
+  std::locale* imbue;
+  std::locale old;
+
+  LocaleSelector(benchmark::State& state) {
+    static std::mutex mu;
+    std::lock_guard l(mu);
+    switch (state.range(0)) {
+    case 0: {
+      old   = std::locale::global(std::locale::classic());
+      imbue = nullptr;
+      break;
+    }
+    case 1: {
+      old = std::locale::global(std::locale::classic());
+      thread_local std::locale l("en_US.UTF-8");
+      imbue = &l;
+      break;
+    }
+    case 2: {
+      old = std::locale::global(std::locale::classic());
+      static std::locale l("en_US.UTF-8");
+      imbue = &l;
+      break;
+    }
+    case 3: {
+      old   = std::locale::global(std::locale("en_US.UTF-8"));
+      imbue = nullptr;
+      break;
+    }
+    }
+  }
+
+  ~LocaleSelector() {
+    static std::mutex mu;
+    std::lock_guard l(mu);
+    std::locale::global(old);
+  }
+};
+
 static void BM_Istream_numbers(benchmark::State& state) {
+  LocaleSelector sel(state);
   double i = 0;
   while (state.KeepRunning())
-    benchmark::DoNotOptimize(i += istream_numbers());
+    benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
+}
+BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
+
+static void BM_Ostream_number(benchmark::State& state) {
+  LocaleSelector sel(state);
+  while (state.KeepRunning()) {
+    std::ostringstream ss;
+    if (sel.imbue)
+      ss.imbue(*sel.imbue);
+    ss << 0;
+    benchmark::DoNotOptimize(ss.str().c_str());
+  }
 }
+BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
 
-BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
 BENCHMARK_MAIN();
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index c37e091dcc4671b..58b2d6c33606ba9 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -8,6 +8,7 @@
 
 #include <__utility/unreachable.h>
 #include <algorithm>
+#include <atomic>
 #include <clocale>
 #include <codecvt>
 #include <cstddef>
@@ -80,7 +81,7 @@ locale_t __cloc() {
 
 namespace {
 
-struct release
+struct releaser
 {
     void operator()(locale::facet* p) {p->__release_shared();}
 };
@@ -154,12 +155,16 @@ class _LIBCPP_HIDDEN locale::__imp
         {return static_cast<size_t>(id) < facets_.size() && facets_[static_cast<size_t>(id)];}
     const locale::facet* use_facet(long id) const;
 
+    void acquire();
+    void release();
+
     static const locale& make_classic();
     static       locale& make_global();
 private:
     void install(facet* f, long id);
     template <class F> void install(F* f) {install(f, f->id.__get());}
     template <class F> void install_from(const __imp& other);
+    static std::atomic<__imp*> classic_;
 };
 
 locale::__imp::__imp(size_t refs)
@@ -501,7 +506,7 @@ locale::__imp::__imp(const __imp& other, facet* f, long id)
       name_("*")
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     facets_ = other.facets_;
     for (unsigned i = 0; i < other.facets_.size(); ++i)
         if (facets_[i])
@@ -520,7 +525,7 @@ void
 locale::__imp::install(facet* f, long id)
 {
     f->__add_shared();
-    unique_ptr<facet, release> hold(f);
+    unique_ptr<facet, releaser> hold(f);
     if (static_cast<size_t>(id) >= facets_.size())
         facets_.resize(static_cast<size_t>(id+1));
     if (facets_[static_cast<size_t>(id)])
@@ -538,6 +543,8 @@ locale::__imp::use_facet(long id) const
 
 // locale
 
+std::atomic<locale::__imp*> locale::__imp::classic_;
+
 const locale&
 locale::__imp::make_classic()
 {
@@ -545,9 +552,22 @@ locale::__imp::make_classic()
     alignas(locale) static std::byte buf[sizeof(locale)];
     locale* c = reinterpret_cast<locale*>(&buf);
     c->__locale_ = &make<__imp>(1u);
+    classic_.store(c->__locale_, std::memory_order_relaxed);
     return *c;
 }
 
+void locale::__imp::acquire()
+{
+    if (this != classic_.load(std::memory_order_relaxed))
+        __add_shared();
+}
+
+void locale::__imp::release()
+{
+    if (this != classic_.load(std::memory_order_relaxed))
+        __release_shared();
+}
+
 const locale&
 locale::classic()
 {
@@ -574,25 +594,25 @@ locale::__global()
 locale::locale() noexcept
     : __locale_(__global().__locale_)
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& l) noexcept
     : __locale_(l.__locale_)
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::~locale()
 {
-    __locale_->__release_shared();
+    __locale_->release();
 }
 
 const locale&
 locale::operator=(const locale& other) noexcept
 {
-    other.__locale_->__add_shared();
-    __locale_->__release_shared();
+    other.__locale_->acquire();
+    __locale_->release();
     __locale_ = other.__locale_;
     return *this;
 }
@@ -601,32 +621,32 @@ locale::locale(const char* name)
     : __locale_(name ? new __imp(name)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const string& name)
     : __locale_(new __imp(name))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const char* name, category c)
     : __locale_(name ? new __imp(*other.__locale_, name, c)
                      : (__throw_runtime_error("locale constructed with null"), nullptr))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const string& name, category c)
     : __locale_(new __imp(*other.__locale_, name, c))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale::locale(const locale& other, const locale& one, category c)
     : __locale_(new __imp(*other.__locale_, *one.__locale_, c))
 {
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 string
@@ -642,7 +662,7 @@ locale::__install_ctor(const locale& other, facet* f, long id)
         __locale_ = new __imp(*other.__locale_, f, id);
     else
         __locale_ = other.__locale_;
-    __locale_->__add_shared();
+    __locale_->acquire();
 }
 
 locale

``````````

</details>


https://github.com/llvm/llvm-project/pull/72112


More information about the libcxx-commits mailing list