[libcxx-commits] [libcxx] [libc++] Speed-up vector<bool> range-based operations [3/3] (PR #120134)
via libcxx-commits
libcxx-commits at lists.llvm.org
Mon Dec 16 12:19:52 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libcxx
Author: Peng Liu (winner245)
<details>
<summary>Changes</summary>
### General description
This PR is part of a series aimed at significantly improving the performance of `vector<bool>`. Each PR focuses on a specific subset of operations, so that it is self-contained and easy to review. The main idea behind the performance improvements is to use a word-wise implementation together with bit-manipulation techniques, rather than the purely bit-wise operations of the previous implementation, which results in substantial performance gains.
### Current PR
This PR enhances the performance of all range-based operations in `vector<bool>` by at least **5x**. The main idea is to provide a more efficient overload of `std::__copy(_InIter __first, _InIter __last, __bit_iterator<_Cp, false> __result)`, which is used by various range-based operations in `vector<bool>`. With this efficient overload of `std::__copy`, all range-based operations benefit from significant performance improvements. This applies to the iterator-pair-based range operations as well as to C++23's range constructor and the `assign_range`, `insert_range`, and `append_range` functions:
- range-ctor `vector(InputIt first, InputIt last, const Allocator& alloc)`: **5.84x**
- C++23 range-ctor `vector(std::from_range_t, R&& rg, const Allocator& alloc)`: **5.86x**
- range-assignment `assign(InputIt first, InputIt last)`: **5.84x**
- C++23 `assign_range(R&& rg)`: **5.9x**
- range-insert `insert(const_iterator pos, InputIt first, InputIt last)`: **6.38x**
- C++23 `insert_range(const_iterator pos, R&& rg)`: **6.45x**
- C++23 `append_range(R&& rg)`: **5.5x**
#### Before:
```
--------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------
BM_ConstructIterIter/vector_bool/5140480 22432969 ns 22560977 ns 31
BM_ConstructFromRange/vector_bool/5140480 22499312 ns 22632239 ns 31
BM_Assign_IterIter/vector_bool/5140480 22542583 ns 22679677 ns 30
BM_Assign_Range/vector_bool/5140480 22739005 ns 22881371 ns 31
BM_Insert_Iter_IterIter/vector_bool/5140480 23249604 ns 23398233 ns 30
BM_Insert_Range/vector_bool/5140480 23031899 ns 23181587 ns 30
BM_Append_Range/vector_bool/5140480 23432886 ns 23586148 ns 29
```
#### After:
```
--------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------
BM_ConstructIterIter/vector_bool/5140480 3836990 ns 3857075 ns 182
BM_ConstructFromRange/vector_bool/5140480 3838558 ns 3860015 ns 177
BM_Assign_IterIter/vector_bool/5140480 3856720 ns 3879212 ns 181
BM_Assign_Range/vector_bool/5140480 3849086 ns 3872665 ns 178
BM_Insert_Iter_IterIter/vector_bool/5140480 3639338 ns 3661651 ns 189
BM_Insert_Range/vector_bool/5140480 3569611 ns 3592612 ns 195
BM_Append_Range/vector_bool/5140480 4256268 ns 4284186 ns 168
```
---
Full diff: https://github.com/llvm/llvm-project/pull/120134.diff
4 Files Affected:
- (modified) libcxx/include/__algorithm/copy.h (+50)
- (modified) libcxx/include/__bit_reference (+3)
- (modified) libcxx/test/benchmarks/containers/ContainerBenchmarks.h (+58)
- (added) libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp (+37)
``````````diff
diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4f30b2050abbaf..f737bc4e98e6d6 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -13,6 +13,8 @@
#include <__algorithm/for_each_segment.h>
#include <__algorithm/min.h>
#include <__config>
+#include <__fwd/bit_reference.h>
+#include <__iterator/distance.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__type_traits/common_type.h>
@@ -95,6 +97,54 @@ struct __copy_impl {
}
}
+ template <class _InIter, class _Cp, __enable_if_t<__has_forward_iterator_category<_InIter>::value, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, __bit_iterator<_Cp, false>>
+ operator()(_InIter __first, _InIter __last, __bit_iterator<_Cp, false> __result) {
+ using _It = __bit_iterator<_Cp, false>;
+ using __storage_type = typename _It::__storage_type;
+ __storage_type __n = static_cast<__storage_type>(std::distance(__first, __last));
+ const unsigned __bits_per_word = _It::__bits_per_word;
+
+ if (__n) {
+ // do first partial word, if present
+ if (__result.__ctz_ != 0) {
+ __storage_type __clz = static_cast<__storage_type>(__bits_per_word - __result.__ctz_);
+ __storage_type __dn = std::min(__clz, __n);
+ __storage_type __w = *__result.__seg_;
+ __storage_type __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __w &= ~__m;
+ for (__storage_type __i = 0; __i < __dn; ++__i, ++__first)
+ __w |= static_cast<__storage_type>(*__first) << __result.__ctz_++;
+ *__result.__seg_ = __w;
+ if (__result.__ctz_ == __bits_per_word) {
+ __result.__ctz_ = 0;
+ ++__result.__seg_;
+ }
+ __n -= __dn;
+ }
+ }
+ // do middle whole words, if present
+ __storage_type __nw = __n / __bits_per_word;
+ __n -= __nw * __bits_per_word;
+ for (; __nw; --__nw) {
+ __storage_type __w = 0;
+ for (__storage_type __i = 0; __i < __bits_per_word; ++__i, ++__first)
+ __w |= static_cast<__storage_type>(*__first) << __i;
+ *__result.__seg_++ = __w;
+ }
+ // do last partial word, if present
+ if (__n) {
+ __storage_type __w = 0;
+ for (__storage_type __i = 0; __i < __n; ++__i, ++__first)
+ __w |= static_cast<__storage_type>(*__first) << __i;
+ __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __w;
+ __result.__ctz_ = __n;
+ }
+ return std::make_pair(std::move(__first), std::move(__result));
+ }
+
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 22637d43974123..e8cbb63988ba54 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -10,6 +10,7 @@
#ifndef _LIBCPP___BIT_REFERENCE
#define _LIBCPP___BIT_REFERENCE
+#include <__algorithm/copy.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/fill_n.h>
#include <__algorithm/min.h>
@@ -970,6 +971,8 @@ private:
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
__fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
+ friend struct __copy_impl;
+
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
diff --git a/libcxx/test/benchmarks/containers/ContainerBenchmarks.h b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
index 6d21e12896ec9e..123f7bc95d4745 100644
--- a/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
+++ b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
@@ -51,6 +51,30 @@ void BM_Assignment(benchmark::State& st, Container) {
}
}
+template <class Container, class GenInputs>
+void BM_Assign_IterIter(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ auto beg = in.begin();
+ auto end = in.end();
+ for (auto _ : st) {
+ c.assign(beg, end);
+ DoNotOptimizeData(c);
+ DoNotOptimizeData(in);
+ benchmark::ClobberMemory();
+ }
+}
+
+template <std::size_t... sz, typename Container, typename GenInputs>
+void BM_Assign_Range(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ for (auto _ : st) {
+ c.assign_range(in);
+ DoNotOptimizeData(c);
+ DoNotOptimizeData(in);
+ benchmark::ClobberMemory();
+ }
+}
+
template <std::size_t... sz, typename Container, typename GenInputs>
void BM_AssignInputIterIter(benchmark::State& st, Container c, GenInputs gen) {
auto v = gen(1, sz...);
@@ -108,6 +132,40 @@ void BM_Pushback_no_grow(benchmark::State& state, Container c) {
}
}
+template <class Container, class GenInputs>
+void BM_Insert_Iter_IterIter(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ const auto beg = in.begin();
+ const auto end = in.end();
+ for (auto _ : st) {
+ c.resize(100);
+ c.insert(c.begin() + 50, beg, end);
+ DoNotOptimizeData(c);
+ benchmark::ClobberMemory();
+ }
+}
+
+template <class Container, class GenInputs>
+void BM_Insert_Range(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ for (auto _ : st) {
+ c.resize(100);
+ c.insert_range(c.begin() + 50, in);
+ DoNotOptimizeData(c);
+ benchmark::ClobberMemory();
+ }
+}
+
+template <class Container, class GenInputs>
+void BM_Append_Range(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ for (auto _ : st) {
+ c.append_range(in);
+ DoNotOptimizeData(c);
+ benchmark::ClobberMemory();
+ }
+}
+
template <class Container, class GenInputs>
void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) {
auto in = gen(st.range(0));
diff --git a/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp b/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp
new file mode 100644
index 00000000000000..2ce10cb6d3d1b6
--- /dev/null
+++ b/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "ContainerBenchmarks.h"
+#include "../GenerateInput.h"
+
+using namespace ContainerBenchmarks;
+
+BENCHMARK_CAPTURE(BM_ConstructIterIter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+
+BENCHMARK_CAPTURE(BM_Assign_IterIter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_Assign_Range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+
+BENCHMARK_CAPTURE(BM_Insert_Iter_IterIter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)
+ ->Arg(5140480);
+BENCHMARK_CAPTURE(BM_Insert_Range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_Append_Range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+
+BENCHMARK_MAIN();
\ No newline at end of file
``````````
</details>
https://github.com/llvm/llvm-project/pull/120134
More information about the libcxx-commits mailing list