[libcxx-commits] [libcxx] 81b8135 - [libc++] Optimize bitset::to_string (#128832)

via libcxx-commits libcxx-commits at lists.llvm.org
Wed May 21 09:16:43 PDT 2025


Author: Peng Liu
Date: 2025-05-21T12:16:40-04:00
New Revision: 81b81354f8c117fab07823fef24b97b3a1f47834

URL: https://github.com/llvm/llvm-project/commit/81b81354f8c117fab07823fef24b97b3a1f47834
DIFF: https://github.com/llvm/llvm-project/commit/81b81354f8c117fab07823fef24b97b3a1f47834.diff

LOG: [libc++] Optimize bitset::to_string (#128832)

This patch optimizes `bitset::to_string` by replacing the existing bit-by-bit processing with a more efficient
bit traversal strategy. Instead of checking each bit sequentially, we leverage `std::__countr_zero` to efficiently
locate the next set bit, skipping over consecutive zero bits. This greatly accelerates the conversion process,
especially for sparse `bitset`s where zero bits dominate. To ensure similar improvements for dense `bitset`s, we
exploit symmetry by inverting the bit pattern, allowing us to apply the same optimized traversal technique. Even
for uniformly distributed `bitset`s, the proposed approach offers measurable performance gains over the existing
implementation.

Benchmarks demonstrate substantial improvements, achieving up to 13.5x speedup for sparse `bitset`s with
`Pr(true bit) = 0.1`, 16.1x for dense `bitset`s with `Pr(true bit) = 0.9`, and 8.3x for uniformly distributed
`bitset`s with `Pr(true bit) = 0.5`.

Added: 
    libcxx/test/benchmarks/bitset.bench.cpp

Modified: 
    libcxx/docs/ReleaseNotes/21.rst
    libcxx/include/bitset

Removed: 
    


################################################################################
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 2e9bb2356831c..d0383a705190d 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -73,6 +73,9 @@ Improvements and New Features
 - The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
   up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
 
+- The ``bitset::to_string`` function has been optimized, resulting in a performance improvement of up to 8.3x for bitsets
+  with uniformly distributed zeros and ones, and up to 13.5x and 16.1x for sparse and dense bitsets, respectively.
+
 Deprecations and Removals
 -------------------------
 

diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 9273ccabbb4e3..eee5a51a39e24 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -137,6 +137,8 @@ template <size_t N> struct hash<std::bitset<N>>;
 #  include <__algorithm/fill_n.h>
 #  include <__algorithm/find.h>
 #  include <__assert>
+#  include <__bit/countr.h>
+#  include <__bit/invert_if.h>
 #  include <__bit_reference>
 #  include <__config>
 #  include <__cstddef/ptrdiff_t.h>
@@ -228,6 +230,21 @@ protected:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool any() const _NOEXCEPT;
   _LIBCPP_HIDE_FROM_ABI size_t __hash_code() const _NOEXCEPT;
 
+  template <bool _Sparse, class _CharT, class _Traits, class _Allocator>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, _Allocator>
+  __to_string(_CharT __zero, _CharT __one) const {
+    basic_string<_CharT, _Traits, _Allocator> __r(_Size, _Sparse ? __zero : __one);
+    for (size_t __i = 0, __bits = 0; __i < _N_words; ++__i, __bits += __bits_per_word) {
+      __storage_type __word = std::__invert_if<!_Sparse>(__first_[__i]);
+      if (__i == _N_words - 1 && _Size - __bits < __bits_per_word)
+        __word &= (__storage_type(1) << (_Size - __bits)) - 1;
+      for (; __word; __word &= (__word - 1))
+        __r[_Size - 1 - (__bits + std::__countr_zero(__word))] = _Sparse ? __one : __zero;
+    }
+
+    return __r;
+  }
+
 private:
 #  ifdef _LIBCPP_CXX03_LANG
   void __init(unsigned long long __v, false_type) _NOEXCEPT;
@@ -483,6 +500,20 @@ protected:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const;
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong() const;
 
+  template <bool _Sparse, class _CharT, class _Traits, class _Allocator>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, _Allocator>
+  __to_string(_CharT __zero, _CharT __one) const {
+    basic_string<_CharT, _Traits, _Allocator> __r(_Size, _Sparse ? __zero : __one);
+    __storage_type __word = std::__invert_if<!_Sparse>(__first_);
+    if (_Size < __bits_per_word)
+      __word &= (__storage_type(1) << _Size) - 1;
+    for (; __word; __word &= (__word - 1)) {
+      size_t __pos           = std::__countr_zero(__word);
+      __r[_Size - 1 - __pos] = _Sparse ? __one : __zero;
+    }
+    return __r;
+  }
+
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool all() const _NOEXCEPT;
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool any() const _NOEXCEPT;
 
@@ -594,6 +625,12 @@ protected:
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long to_ulong() const { return 0; }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long to_ullong() const { return 0; }
 
+  template <bool _Sparse, class _CharT, class _Traits, class _Allocator>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, _Allocator>
+  __to_string(_CharT, _CharT) const {
+    return basic_string<_CharT, _Traits, _Allocator>();
+  }
+
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool all() const _NOEXCEPT { return true; }
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 bool any() const _NOEXCEPT { return false; }
 
@@ -847,12 +884,11 @@ template <size_t _Size>
 template <class _CharT, class _Traits, class _Allocator>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 basic_string<_CharT, _Traits, _Allocator>
 bitset<_Size>::to_string(_CharT __zero, _CharT __one) const {
-  basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero);
-  for (size_t __i = 0; __i != _Size; ++__i) {
-    if ((*this)[__i])
-      __r[_Size - 1 - __i] = __one;
-  }
-  return __r;
+  bool __sparse = size_t(std::count(__base::__make_iter(0), __base::__make_iter(_Size), true)) < _Size / 2;
+  if (__sparse)
+    return __base::template __to_string<true, _CharT, _Traits, _Allocator>(__zero, __one);
+  else
+    return __base::template __to_string<false, _CharT, _Traits, _Allocator>(__zero, __one);
 }
 
 template <size_t _Size>

diff --git a/libcxx/test/benchmarks/bitset.bench.cpp b/libcxx/test/benchmarks/bitset.bench.cpp
new file mode 100644
index 0000000000000..5e95d3aad1cb2
--- /dev/null
+++ b/libcxx/test/benchmarks/bitset.bench.cpp
@@ -0,0 +1,106 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+#include "benchmark/benchmark.h"
+#include <bitset>
+#include <cmath>
+#include <cstddef>
+#include <random>
+
+template <std::size_t N>
+struct GenerateBitset {
+  // Construct a bitset with N bits, where each bit is set with probability p.
+  static std::bitset<N> generate(double p) {
+    std::bitset<N> b;
+    if (p <= 0.0)
+      return b;
+    if (p >= 1.0)
+      return ~b;
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::bernoulli_distribution d(p);
+    for (std::size_t i = 0; i < N; ++i)
+      b[i] = d(gen);
+
+    return b;
+  }
+
+  static std::bitset<N> sparse() { return generate(0.1); }
+  static std::bitset<N> dense() { return generate(0.9); }
+  static std::bitset<N> uniform() { return generate(0.5); }
+};
+
+template <std::size_t N>
+static void BM_BitsetToString(benchmark::State& state) {
+  double p         = state.range(0) / 100.0;
+  std::bitset<N> b = GenerateBitset<N>::generate(p);
+  benchmark::DoNotOptimize(b);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(b.to_string());
+  }
+}
+
+// Sparse bitset
+BENCHMARK(BM_BitsetToString<32>)->Arg(10)->Name("BM_BitsetToString<32>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<64>)->Arg(10)->Name("BM_BitsetToString<64>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<128>)->Arg(10)->Name("BM_BitsetToString<128>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<256>)->Arg(10)->Name("BM_BitsetToString<256>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<512>)->Arg(10)->Name("BM_BitsetToString<512>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<1024>)->Arg(10)->Name("BM_BitsetToString<1024>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<2048>)->Arg(10)->Name("BM_BitsetToString<2048>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<4096>)->Arg(10)->Name("BM_BitsetToString<4096>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<8192>)->Arg(10)->Name("BM_BitsetToString<8192>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<16384>)->Arg(10)->Name("BM_BitsetToString<16384>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<32768>)->Arg(10)->Name("BM_BitsetToString<32768>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<65536>)->Arg(10)->Name("BM_BitsetToString<65536>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<131072>)->Arg(10)->Name("BM_BitsetToString<131072>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<262144>)->Arg(10)->Name("BM_BitsetToString<262144>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<524288>)->Arg(10)->Name("BM_BitsetToString<524288>/Sparse (10%)");
+BENCHMARK(BM_BitsetToString<1048576>)->Arg(10)->Name("BM_BitsetToString<1048576>/Sparse (10%)"); // 1 << 20
+
+// Dense bitset
+BENCHMARK(BM_BitsetToString<32>)->Arg(90)->Name("BM_BitsetToString<32>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<64>)->Arg(90)->Name("BM_BitsetToString<64>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<128>)->Arg(90)->Name("BM_BitsetToString<128>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<256>)->Arg(90)->Name("BM_BitsetToString<256>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<512>)->Arg(90)->Name("BM_BitsetToString<512>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<1024>)->Arg(90)->Name("BM_BitsetToString<1024>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<2048>)->Arg(90)->Name("BM_BitsetToString<2048>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<4096>)->Arg(90)->Name("BM_BitsetToString<4096>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<8192>)->Arg(90)->Name("BM_BitsetToString<8192>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<16384>)->Arg(90)->Name("BM_BitsetToString<16384>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<32768>)->Arg(90)->Name("BM_BitsetToString<32768>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<65536>)->Arg(90)->Name("BM_BitsetToString<65536>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<131072>)->Arg(90)->Name("BM_BitsetToString<131072>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<262144>)->Arg(90)->Name("BM_BitsetToString<262144>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<524288>)->Arg(90)->Name("BM_BitsetToString<524288>/Dense (90%)");
+BENCHMARK(BM_BitsetToString<1048576>)->Arg(90)->Name("BM_BitsetToString<1048576>/Dense (90%)"); // 1 << 20
+
+// Uniform bitset
+BENCHMARK(BM_BitsetToString<32>)->Arg(50)->Name("BM_BitsetToString<32>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<64>)->Arg(50)->Name("BM_BitsetToString<64>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<128>)->Arg(50)->Name("BM_BitsetToString<128>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<256>)->Arg(50)->Name("BM_BitsetToString<256>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<512>)->Arg(50)->Name("BM_BitsetToString<512>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<1024>)->Arg(50)->Name("BM_BitsetToString<1024>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<2048>)->Arg(50)->Name("BM_BitsetToString<2048>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<4096>)->Arg(50)->Name("BM_BitsetToString<4096>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<8192>)->Arg(50)->Name("BM_BitsetToString<8192>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<16384>)->Arg(50)->Name("BM_BitsetToString<16384>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<32768>)->Arg(50)->Name("BM_BitsetToString<32768>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<65536>)->Arg(50)->Name("BM_BitsetToString<65536>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<131072>)->Arg(50)->Name("BM_BitsetToString<131072>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<262144>)->Arg(50)->Name("BM_BitsetToString<262144>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<524288>)->Arg(50)->Name("BM_BitsetToString<524288>/Uniform (50%)");
+BENCHMARK(BM_BitsetToString<1048576>)->Arg(50)->Name("BM_BitsetToString<1048576>/Uniform (50%)"); // 1 << 20
+
+BENCHMARK_MAIN();


        


More information about the libcxx-commits mailing list