[libcxx-commits] [libcxx] [libc++] Speed-up vector<bool> range-based operations [3/3] (PR #120134)

Peng Liu via libcxx-commits libcxx-commits at lists.llvm.org
Wed Dec 18 07:14:35 PST 2024


https://github.com/winner245 updated https://github.com/llvm/llvm-project/pull/120134

>From 42390667a47bb40e4b1635d9aa4952e5134318dd Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Mon, 16 Dec 2024 13:30:56 -0500
Subject: [PATCH] Speed-up range operations in vector<bool>

---
 libcxx/include/__algorithm/copy.h             | 50 ++++++++++++++++
 libcxx/include/__bit_reference                |  2 +
 libcxx/include/__cxx03/__algorithm/copy.h     | 50 ++++++++++++++++
 libcxx/include/__cxx03/__bit_reference        |  5 +-
 .../containers/ContainerBenchmarks.h          | 58 +++++++++++++++++++
 .../vector_bool_operations.bench.cpp          | 37 ++++++++++++
 6 files changed, 201 insertions(+), 1 deletion(-)
 create mode 100644 libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp

diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4f30b2050abbaf..ff8cfa5a97fb62 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -13,6 +13,8 @@
 #include <__algorithm/for_each_segment.h>
 #include <__algorithm/min.h>
 #include <__config>
+#include <__fwd/bit_reference.h>
+#include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/common_type.h>
@@ -95,6 +97,54 @@ struct __copy_impl {
     }
   }
 
+  // Copies [__first, __last) onto the bits starting at __result, assembling each destination storage word in a local
+  // before writing it back. Each source element is converted to bool first, so only the addressed bit can be set.
+  template <class _InIter, class _Cp, __enable_if_t<__has_forward_iterator_category<_InIter>::value, int> = 0>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, __bit_iterator<_Cp, false>>
+  operator()(_InIter __first, _InIter __last, __bit_iterator<_Cp, false> __result) {
+    using _It                      = __bit_iterator<_Cp, false>;
+    using __storage_type           = typename _It::__storage_type;
+    __storage_type __n             = static_cast<__storage_type>(std::distance(__first, __last));
+    const unsigned __bits_per_word = _It::__bits_per_word;
+
+    // do first partial word, if present
+    if (__n != 0 && __result.__ctz_ != 0) {
+      __storage_type __clz = static_cast<__storage_type>(__bits_per_word - __result.__ctz_);
+      __storage_type __dn  = std::min(__clz, __n);
+      __storage_type __w   = *__result.__seg_;
+      __storage_type __m   = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __w &= ~__m; // clear only the __dn destination bits; the other bits of the word are kept
+      for (__storage_type __i = 0; __i < __dn; ++__i, ++__first)
+        __w |= static_cast<__storage_type>(static_cast<bool>(*__first)) << __result.__ctz_++;
+      *__result.__seg_ = __w;
+      if (__result.__ctz_ == __bits_per_word) {
+        __result.__ctz_ = 0;
+        ++__result.__seg_;
+      }
+      __n -= __dn;
+    }
+    // do middle whole words, if present
+    __storage_type __nw = __n / __bits_per_word;
+    __n -= __nw * __bits_per_word;
+    for (; __nw; --__nw) {
+      __storage_type __w = 0;
+      for (__storage_type __i = 0; __i < __bits_per_word; ++__i, ++__first)
+        __w |= static_cast<__storage_type>(static_cast<bool>(*__first)) << __i;
+      *__result.__seg_++ = __w;
+    }
+    // do last partial word, if present
+    if (__n) {
+      __storage_type __w = 0;
+      for (__storage_type __i = 0; __i < __n; ++__i, ++__first)
+        __w |= static_cast<__storage_type>(static_cast<bool>(*__first)) << __i;
+      __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); // the low __n bits of the word
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __w;
+      __result.__ctz_ = __n;
+    }
+    return std::make_pair(std::move(__first), std::move(__result));
+  }
+
   // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
   template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 22637d43974123..3424f455ac13dd 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___BIT_REFERENCE
 #define _LIBCPP___BIT_REFERENCE
 
+#include <__algorithm/copy.h>
 #include <__algorithm/copy_n.h>
 #include <__algorithm/fill_n.h>
 #include <__algorithm/min.h>
@@ -965,6 +966,7 @@ private:
   friend class __bit_iterator<_Cp, true>;
   template <class _Dp>
   friend struct __bit_array;
+  friend struct __copy_impl; // its __bit_iterator overload of operator() uses the private __seg_ and __ctz_ members
 
   template <bool _FillVal, class _Dp>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
diff --git a/libcxx/include/__cxx03/__algorithm/copy.h b/libcxx/include/__cxx03/__algorithm/copy.h
index 2aa0ab78b78582..e1c03ba74b8916 100644
--- a/libcxx/include/__cxx03/__algorithm/copy.h
+++ b/libcxx/include/__cxx03/__algorithm/copy.h
@@ -14,6 +14,8 @@
 #include <__cxx03/__algorithm/iterator_operations.h>
 #include <__cxx03/__algorithm/min.h>
 #include <__cxx03/__config>
+#include <__cxx03/__fwd/bit_reference.h>
+#include <__cxx03/__iterator/distance.h>
 #include <__cxx03/__iterator/segmented_iterator.h>
 #include <__cxx03/__type_traits/common_type.h>
 #include <__cxx03/__utility/move.h>
@@ -95,6 +97,54 @@ struct __copy_impl {
     }
   }
 
+  // Copies [__first, __last) onto the bits starting at __result, assembling each destination storage word in a local
+  // before writing it back. Each source element is converted to bool first, so only the addressed bit can be set.
+  template <class _InIter, class _Cp, __enable_if_t<__has_forward_iterator_category<_InIter>::value, int> = 0>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, __bit_iterator<_Cp, false>>
+  operator()(_InIter __first, _InIter __last, __bit_iterator<_Cp, false> __result) {
+    using _It                      = __bit_iterator<_Cp, false>;
+    using __storage_type           = typename _It::__storage_type;
+    __storage_type __n             = static_cast<__storage_type>(std::distance(__first, __last));
+    const unsigned __bits_per_word = _It::__bits_per_word;
+
+    // do first partial word, if present
+    if (__n != 0 && __result.__ctz_ != 0) {
+      __storage_type __clz = static_cast<__storage_type>(__bits_per_word - __result.__ctz_);
+      __storage_type __dn  = std::min(__clz, __n);
+      __storage_type __w   = *__result.__seg_;
+      __storage_type __m   = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __w &= ~__m; // clear only the __dn destination bits; the other bits of the word are kept
+      for (__storage_type __i = 0; __i < __dn; ++__i, ++__first)
+        __w |= static_cast<__storage_type>(static_cast<bool>(*__first)) << __result.__ctz_++;
+      *__result.__seg_ = __w;
+      if (__result.__ctz_ == __bits_per_word) {
+        __result.__ctz_ = 0;
+        ++__result.__seg_;
+      }
+      __n -= __dn;
+    }
+    // do middle whole words, if present
+    __storage_type __nw = __n / __bits_per_word;
+    __n -= __nw * __bits_per_word;
+    for (; __nw; --__nw) {
+      __storage_type __w = 0;
+      for (__storage_type __i = 0; __i < __bits_per_word; ++__i, ++__first)
+        __w |= static_cast<__storage_type>(static_cast<bool>(*__first)) << __i;
+      *__result.__seg_++ = __w;
+    }
+    // do last partial word, if present
+    if (__n) {
+      __storage_type __w = 0;
+      for (__storage_type __i = 0; __i < __n; ++__i, ++__first)
+        __w |= static_cast<__storage_type>(static_cast<bool>(*__first)) << __i;
+      __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); // the low __n bits of the word
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __w;
+      __result.__ctz_ = __n;
+    }
+    return std::make_pair(std::move(__first), std::move(__result));
+  }
+
   // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
   template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
diff --git a/libcxx/include/__cxx03/__bit_reference b/libcxx/include/__cxx03/__bit_reference
index bf86f9a76e24a1..64ba33dfe4a4b9 100644
--- a/libcxx/include/__cxx03/__bit_reference
+++ b/libcxx/include/__cxx03/__bit_reference
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___CXX03___BIT_REFERENCE
 #define _LIBCPP___CXX03___BIT_REFERENCE
 
+#include <__cxx03/__algorithm/copy.h>
 #include <__cxx03/__algorithm/copy_n.h>
 #include <__cxx03/__algorithm/fill_n.h>
 #include <__cxx03/__algorithm/min.h>
@@ -965,7 +966,9 @@ private:
   friend class __bit_iterator<_Cp, true>;
   template <class _Dp>
   friend struct __bit_array;
-
+  template <class _AlgPolicy>
+  friend struct __copy_impl; // its __bit_iterator overload of operator() uses the private __seg_ and __ctz_ members
+
   template <bool _FillVal, class _Dp>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
   __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
diff --git a/libcxx/test/benchmarks/containers/ContainerBenchmarks.h b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
index 6d21e12896ec9e..c3effbbed1d110 100644
--- a/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
+++ b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
@@ -51,6 +51,30 @@ void BM_Assignment(benchmark::State& st, Container) {
   }
 }
 
+template <class Container, class GenInputs>
+void BM_assign_iter_iter(benchmark::State& st, Container c, GenInputs gen) {
+  auto source = gen(st.range(0)); // generated once, outside the timed loop
+  auto first  = source.begin();
+  auto last   = source.end();
+  for (auto _ : st) {
+    c.assign(first, last); // the operation being measured
+    DoNotOptimizeData(c);
+    DoNotOptimizeData(source);
+    benchmark::ClobberMemory();
+  }
+}
+
+template <class Container, class GenInputs>
+void BM_assign_range(benchmark::State& st, Container c, GenInputs gen) {
+  auto in = gen(st.range(0)); // generated once, outside the timed loop
+  for (auto _ : st) {
+    c.assign_range(in); // the operation being measured
+    DoNotOptimizeData(c);
+    DoNotOptimizeData(in);
+    benchmark::ClobberMemory();
+  }
+}
+
 template <std::size_t... sz, typename Container, typename GenInputs>
 void BM_AssignInputIterIter(benchmark::State& st, Container c, GenInputs gen) {
   auto v = gen(1, sz...);
@@ -108,6 +132,40 @@ void BM_Pushback_no_grow(benchmark::State& state, Container c) {
   }
 }
 
+template <class Container, class GenInputs>
+void BM_insert_iter_iter_iter(benchmark::State& st, Container c, GenInputs gen) {
+  auto source      = gen(st.range(0)); // generated once, outside the timed loop
+  const auto first = source.begin();
+  const auto last  = source.end();
+  for (auto _ : st) {
+    c.resize(100); // bring c back to 100 elements before each insertion
+    c.insert(c.begin() + 50, first, last);
+    DoNotOptimizeData(c);
+    benchmark::ClobberMemory();
+  }
+}
+
+template <class Container, class GenInputs>
+void BM_insert_range(benchmark::State& st, Container c, GenInputs gen) {
+  auto source = gen(st.range(0)); // generated once, outside the timed loop
+  for (auto _ : st) {
+    c.resize(100); // bring c back to 100 elements before each insertion
+    c.insert_range(c.begin() + 50, source);
+    DoNotOptimizeData(c);
+    benchmark::ClobberMemory();
+  }
+}
+
+template <class Container, class GenInputs>
+void BM_append_range(benchmark::State& st, Container c, GenInputs gen) {
+  auto source = gen(st.range(0)); // generated once, outside the timed loop
+  for (auto _ : st) {
+    c.append_range(source); // NOTE(review): c is never reset here, so it grows on every iteration
+    DoNotOptimizeData(c);
+    benchmark::ClobberMemory();
+  }
+}
+
 template <class Container, class GenInputs>
 void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) {
   auto in        = gen(st.range(0));
diff --git a/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp b/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp
new file mode 100644
index 00000000000000..e62f72ab02e72f
--- /dev/null
+++ b/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "ContainerBenchmarks.h"
+#include "../GenerateInput.h"
+
+using namespace ContainerBenchmarks;
+// Construction benchmarks; every registration below uses std::vector<bool> and Arg(5140480).
+BENCHMARK_CAPTURE(BM_ConstructIterIter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+// assign(first, last) and assign_range; the Arg value is read back as st.range(0) and passed to the generator.
+BENCHMARK_CAPTURE(BM_assign_iter_iter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_assign_range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+// insert(pos, first, last), insert_range and append_range.
+BENCHMARK_CAPTURE(BM_insert_iter_iter_iter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)
+    ->Arg(5140480);
+BENCHMARK_CAPTURE(BM_insert_range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_append_range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+
+BENCHMARK_MAIN();



More information about the libcxx-commits mailing list