[libcxx-commits] [libcxx] [libc++] Speed up classic locale (PR #70631)
Dmitry Vyukov via libcxx-commits
libcxx-commits at lists.llvm.org
Tue Oct 31 02:16:15 PDT 2023
https://github.com/dvyukov updated https://github.com/llvm/llvm-project/pull/70631
>From 517bebbd306675b326f096f7bc9c5ecffa8f0517 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Mon, 30 Oct 2023 08:40:47 +0100
Subject: [PATCH 1/3] [libc++] Speed up classic locale
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Locale objects use atomic reference counting, which may be very expensive
in parallel applications. The classic locale is used by default by all
streams and can be very contended. But it's never destroyed, so the
reference counting is also completely pointless on the classic locale.
Currently ~70% of the time in the parallel stringstream benchmarks is spent
in the locale ctor/dtor, and execution slows down radically with more threads.
Avoid reference counting on the classic locale and inline common ctors/dtor.
With this change, locale ctor/dtor time becomes negligible and the benchmark
starts to scale with threads.
│baseline sec/op│ optimized sec/op │
Ostream_number/threads:1 184.5n ± 0% 133.0n ± 1% -27.91% (p=0.000 n=30)
Ostream_number/threads:72 24188.0n ± 3% 321.0n ± 2% -98.67% (p=0.000 n=30)
Istream_numbers/1024/threads:1 4.667µ ± 1% 4.273µ ± 0% -8.43% (p=0.000 n=30)
Istream_numbers/1024/threads:72 559.657µ ± 1% 9.350µ ± 1% -98.33% (p=0.000 n=30)
---
libcxx/benchmarks/stringstream.bench.cpp | 11 ++-
libcxx/include/__locale | 42 ++++++++++-
libcxx/src/locale.cpp | 91 +++++++++++-------------
3 files changed, 92 insertions(+), 52 deletions(-)
diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/benchmarks/stringstream.bench.cpp
index ea602557ccd770e..1016f4f110867f9 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/benchmarks/stringstream.bench.cpp
@@ -25,6 +25,15 @@ static void BM_Istream_numbers(benchmark::State& state) {
while (state.KeepRunning())
benchmark::DoNotOptimize(i += istream_numbers());
}
+BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096)->UseRealTime()->Threads(1)->ThreadPerCpu();
+
+static void BM_Ostream_number(benchmark::State& state) {
+ while (state.KeepRunning()) {
+ std::ostringstream ss;
+ ss << 0;
+ benchmark::DoNotOptimize(ss.str().c_str());
+ }
+}
+BENCHMARK(BM_Ostream_number)->UseRealTime()->Threads(1)->ThreadPerCpu();
-BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
BENCHMARK_MAIN();
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index 90dcad3590c3d21..123c7000b9ff99e 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -126,9 +126,15 @@ public:
private:
class __imp;
__imp* __locale_;
+ static __imp* __classic_;
void __install_ctor(const locale&, facet*, long);
- static locale& __global();
+ static __imp*& __global();
+ static __imp* __make_global();
+ static __imp* __maybe_acquire(__imp* __i);
+ static __imp* __do_acquire(__imp* __i);
+ static void __maybe_release(__imp* __i);
+ static void __do_release(__imp* __i);
bool has_facet(id&) const;
const facet* use_facet(id&) const;
@@ -136,6 +142,40 @@ private:
template <class _Facet> friend const _Facet& use_facet(const locale&);
};
+inline locale::locale() _NOEXCEPT
+ : __locale_(__maybe_acquire(__global()))
+{
+}
+
+inline locale::locale(const locale& l) _NOEXCEPT
+ : __locale_(__maybe_acquire(l.__locale_))
+{
+}
+
+inline locale::~locale()
+{
+ __maybe_release(__locale_);
+}
+
+inline locale::__imp*& locale::__global()
+{
+ static __imp* __g = __make_global();
+ return __g;
+}
+
+inline locale::__imp* locale::__maybe_acquire(__imp* __i)
+{
+ if (__i != __classic_)
+ __do_acquire(__i);
+ return __i;
+}
+
+inline void locale::__maybe_release(__imp* __i)
+{
+ if (__i != __classic_)
+ __do_release(__i);
+}
+
class _LIBCPP_EXPORTED_FROM_ABI locale::facet
: public __shared_count
{
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 317b4dec7d241e5..a5d883718aea3b0 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -154,7 +154,6 @@ class _LIBCPP_HIDDEN locale::__imp
const locale::facet* use_facet(long id) const;
static const locale& make_classic();
- static locale& make_global();
private:
void install(facet* f, long id);
template <class F> void install(F* f) {install(f, f->id.__get());}
@@ -537,6 +536,8 @@ locale::__imp::use_facet(long id) const
// locale
+locale::__imp* locale::__classic_;
+
const locale&
locale::__imp::make_classic()
{
@@ -544,6 +545,16 @@ locale::__imp::make_classic()
static aligned_storage<sizeof(locale)>::type buf;
locale* c = reinterpret_cast<locale*>(&buf);
c->__locale_ = &make<__imp>(1u);
+ // We don't do reference counting on the classic locale.
+ // It's never destroyed anyway, but atomic reference counting may be very
+ // expensive in parallel applications. The classic locale is used by default
+ // in all streams. Note: if a new global locale is installed, then we lose
+ // the benefit of no reference counting. Potentially we can omit reference
+ // counting on all locales that are ever installed as global (leak them).
+ // Programs are not expected to install unbounded number of unique global
+ // locales, and global locale cannot be installed if any threads are running
+ // so real programs shouldn't install them at all.
+ c->__classic_ = c->__locale_;
return *c;
}
@@ -554,78 +565,58 @@ locale::classic()
return c;
}
-locale&
-locale::__imp::make_global()
+locale::__imp*
+locale::__make_global()
{
// only one thread can get in here and it only gets in once
- static aligned_storage<sizeof(locale)>::type buf;
- auto *obj = ::new (&buf) locale(locale::classic());
- return *obj;
-}
-
-locale&
-locale::__global()
-{
- static locale& g = __imp::make_global();
- return g;
-}
-
-locale::locale() noexcept
- : __locale_(__global().__locale_)
-{
- __locale_->__add_shared();
-}
-
-locale::locale(const locale& l) noexcept
- : __locale_(l.__locale_)
-{
- __locale_->__add_shared();
-}
-
-locale::~locale()
-{
- __locale_->__release_shared();
+ return classic().__locale_;
}
const locale&
locale::operator=(const locale& other) noexcept
{
- other.__locale_->__add_shared();
- __locale_->__release_shared();
+ __maybe_acquire(other.__locale_);
+ __maybe_release(__locale_);
__locale_ = other.__locale_;
return *this;
}
locale::locale(const char* name)
- : __locale_(name ? new __imp(name)
- : (__throw_runtime_error("locale constructed with null"), nullptr))
+ : __locale_(__do_acquire(name ? new __imp(name)
+ : (__throw_runtime_error("locale constructed with null"), nullptr)))
{
- __locale_->__add_shared();
}
locale::locale(const string& name)
- : __locale_(new __imp(name))
+ : __locale_(__do_acquire(new __imp(name)))
{
- __locale_->__add_shared();
}
locale::locale(const locale& other, const char* name, category c)
- : __locale_(name ? new __imp(*other.__locale_, name, c)
- : (__throw_runtime_error("locale constructed with null"), nullptr))
+ : __locale_(__do_acquire(name ? new __imp(*other.__locale_, name, c)
+ : (__throw_runtime_error("locale constructed with null"), nullptr)))
{
- __locale_->__add_shared();
}
locale::locale(const locale& other, const string& name, category c)
- : __locale_(new __imp(*other.__locale_, name, c))
+ : __locale_(__do_acquire(new __imp(*other.__locale_, name, c)))
{
- __locale_->__add_shared();
}
locale::locale(const locale& other, const locale& one, category c)
- : __locale_(new __imp(*other.__locale_, *one.__locale_, c))
+ : __locale_(__do_acquire(new __imp(*other.__locale_, *one.__locale_, c)))
+{
+}
+
+locale::__imp* locale::__do_acquire(__imp* __i)
+{
+ __i->__add_shared();
+ return __i;
+}
+
+void locale::__do_release(__imp* __i)
{
- __locale_->__add_shared();
+ __i->__release_shared();
}
string
@@ -641,17 +632,17 @@ locale::__install_ctor(const locale& other, facet* f, long id)
__locale_ = new __imp(*other.__locale_, f, id);
else
__locale_ = other.__locale_;
- __locale_->__add_shared();
+ __maybe_acquire(__locale_);
}
locale
locale::global(const locale& loc)
{
- locale& g = __global();
- locale r = g;
- g = loc;
- if (g.name() != "*")
- setlocale(LC_ALL, g.name().c_str());
+ __imp*& g = __global();
+ locale r = loc;
+ swap(g, r.__locale_);
+ if (g->name() != "*")
+ setlocale(LC_ALL, g->name().c_str());
return r;
}
>From 8d48c090cfbb385f17076c2c752ee195f36384f8 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Tue, 31 Oct 2023 08:44:25 +0100
Subject: [PATCH 2/3] add _LIBCPP_HIDE_FROM_ABI to inline functions
---
libcxx/include/__locale | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index 123c7000b9ff99e..839c6c8eedc85ef 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -90,8 +90,8 @@ public:
all = collate | ctype | monetary | numeric | time | messages;
// construct/copy/destroy:
- locale() _NOEXCEPT;
- locale(const locale&) _NOEXCEPT;
+ _LIBCPP_HIDE_FROM_ABI locale() _NOEXCEPT;
+ _LIBCPP_HIDE_FROM_ABI locale(const locale&) _NOEXCEPT;
explicit locale(const char*);
explicit locale(const string&);
locale(const locale&, const char*, category);
@@ -100,7 +100,7 @@ public:
_LIBCPP_INLINE_VISIBILITY locale(const locale&, _Facet*);
locale(const locale&, const locale&, category);
- ~locale();
+ _LIBCPP_HIDE_FROM_ABI ~locale();
const locale& operator=(const locale&) _NOEXCEPT;
@@ -129,11 +129,11 @@ private:
static __imp* __classic_;
void __install_ctor(const locale&, facet*, long);
- static __imp*& __global();
+ static _LIBCPP_HIDE_FROM_ABI __imp*& __global();
static __imp* __make_global();
- static __imp* __maybe_acquire(__imp* __i);
+ static _LIBCPP_HIDE_FROM_ABI __imp* __maybe_acquire(__imp* __i);
static __imp* __do_acquire(__imp* __i);
- static void __maybe_release(__imp* __i);
+ static _LIBCPP_HIDE_FROM_ABI void __maybe_release(__imp* __i);
static void __do_release(__imp* __i);
bool has_facet(id&) const;
const facet* use_facet(id&) const;
>From 02f4235702682539bc99168a445b2bc19310b240 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Tue, 31 Oct 2023 08:44:42 +0100
Subject: [PATCH 3/3] extend benchmarks to non-classic locale
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
│ bench/stream.baseline.res │ bench/stream.optimized.res │
│ sec/op │ sec/op vs base │
Istream_numbers/0/real_time/threads:1 4.686µ ± 0% 4.213µ ± 0% -10.09% (p=0.000 n=24)
Istream_numbers/0/real_time/threads:72 339.712µ ± 4% 8.939µ ± 2% -97.37% (p=0.000 n=24)
Istream_numbers/1/real_time/threads:1 4.900µ ± 0% 4.773µ ± 0% -2.60% (p=0.000 n=24)
Istream_numbers/1/real_time/threads:72 49.308µ ± 3% 9.694µ ± 1% -80.34% (p=0.000 n=24)
Istream_numbers/2/real_time/threads:1 4.902µ ± 0% 4.769µ ± 0% -2.72% (p=0.000 n=24)
Istream_numbers/2/real_time/threads:72 416.6µ ± 9% 410.5µ ± 6% ~ (p=0.111 n=24)
Istream_numbers/3/real_time/threads:1 4.716µ ± 0% 4.737µ ± 0% +0.45% (p=0.000 n=24)
Istream_numbers/3/real_time/threads:72 419.7µ ± 6% 458.3µ ± 4% +9.19% (p=0.003 n=24)
Ostream_number/0/real_time/threads:1 184.0n ± 1% 135.0n ± 1% -26.63% (p=0.000 n=24)
Ostream_number/0/real_time/threads:72 17279.0n ± 4% 308.0n ± 1% -98.22% (p=0.000 n=24)
Ostream_number/1/real_time/threads:1 253.5n ± 1% 199.0n ± 1% -21.50% (p=0.000 n=24)
Ostream_number/1/real_time/threads:72 13957.5n ± 2% 382.0n ± 0% -97.26% (p=0.000 n=24)
Ostream_number/2/real_time/threads:1 253.0n ± 1% 202.0n ± 1% -20.16% (p=0.000 n=24)
Ostream_number/2/real_time/threads:72 28.04µ ± 4% 18.80µ ± 7% -32.96% (p=0.000 n=24)
Ostream_number/3/real_time/threads:1 187.0n ± 1% 190.5n ± 0% +1.87% (p=0.000 n=24)
Ostream_number/3/real_time/threads:72 20.25µ ± 7% 20.37µ ± 5% ~ (p=0.736 n=24)
---
libcxx/benchmarks/stringstream.bench.cpp | 55 ++++++++++++++++++++++--
1 file changed, 51 insertions(+), 4 deletions(-)
diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/benchmarks/stringstream.bench.cpp
index 1016f4f110867f9..c10ee3a8cc5b83c 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/benchmarks/stringstream.bench.cpp
@@ -1,11 +1,12 @@
#include "benchmark/benchmark.h"
#include "test_macros.h"
+#include <mutex>
#include <sstream>
TEST_NOINLINE double istream_numbers();
-double istream_numbers() {
+double istream_numbers(std::locale* l) {
const char* a[] = {"-6 69 -71 2.4882e-02 -100 101 -2.00005 5000000 -50000000",
"-25 71 7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
"-14 53 46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,26 +15,72 @@ double istream_numbers() {
double f1 = 0.0, f2 = 0.0, q = 0.0;
for (int i = 0; i < 3; i++) {
std::istringstream s(a[i]);
+ if (l)
+ s.imbue(*l);
s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
}
return q;
}
+struct LocaleSelector {
+ std::locale* imbue;
+ std::locale old;
+
+ LocaleSelector(benchmark::State& state) {
+ static std::mutex mu;
+ std::lock_guard l(mu);
+ switch (state.range(0)) {
+ case 0: {
+ old = std::locale::global(std::locale::classic());
+ imbue = nullptr;
+ break;
+ }
+ case 1: {
+ old = std::locale::global(std::locale::classic());
+ thread_local std::locale l("en_US.UTF-8");
+ imbue = &l;
+ break;
+ }
+ case 2: {
+ old = std::locale::global(std::locale::classic());
+ static std::locale l("en_US.UTF-8");
+ imbue = &l;
+ break;
+ }
+ case 3: {
+ old = std::locale::global(std::locale("en_US.UTF-8"));
+ imbue = nullptr;
+ break;
+ }
+ }
+ }
+
+ ~LocaleSelector() {
+ static std::mutex mu;
+ std::lock_guard l(mu);
+ std::locale::global(old);
+ }
+};
+
static void BM_Istream_numbers(benchmark::State& state) {
+ LocaleSelector sel(state);
double i = 0;
while (state.KeepRunning())
- benchmark::DoNotOptimize(i += istream_numbers());
+ benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
}
-BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096)->UseRealTime()->Threads(1)->ThreadPerCpu();
+BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
static void BM_Ostream_number(benchmark::State& state) {
+ LocaleSelector sel(state);
while (state.KeepRunning()) {
std::ostringstream ss;
+ if (sel.imbue)
+ ss.imbue(*sel.imbue);
ss << 0;
benchmark::DoNotOptimize(ss.str().c_str());
}
}
-BENCHMARK(BM_Ostream_number)->UseRealTime()->Threads(1)->ThreadPerCpu();
+BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
BENCHMARK_MAIN();
More information about the libcxx-commits
mailing list