[libcxx-commits] [libcxx] [libc++] Optimize ranges::fill{,_n} for vector<bool>::iterator (PR #84642)
Nikolas Klauser via libcxx-commits
libcxx-commits at lists.llvm.org
Sat Mar 9 09:24:57 PST 2024
https://github.com/philnik777 created https://github.com/llvm/llvm-project/pull/84642
```
------------------------------------------------------
Benchmark old new
------------------------------------------------------
bm_ranges_fill_n/1 1.64 ns 3.06 ns
bm_ranges_fill_n/2 3.45 ns 3.06 ns
bm_ranges_fill_n/3 4.88 ns 3.06 ns
bm_ranges_fill_n/4 6.46 ns 3.06 ns
bm_ranges_fill_n/5 8.03 ns 3.06 ns
bm_ranges_fill_n/6 9.65 ns 3.07 ns
bm_ranges_fill_n/7 11.5 ns 3.06 ns
bm_ranges_fill_n/8 13.0 ns 3.06 ns
bm_ranges_fill_n/16 25.9 ns 3.06 ns
bm_ranges_fill_n/64 103 ns 4.62 ns
bm_ranges_fill_n/512 711 ns 4.40 ns
bm_ranges_fill_n/4096 5642 ns 9.86 ns
bm_ranges_fill_n/32768 45135 ns 33.6 ns
bm_ranges_fill_n/262144 360818 ns 243 ns
bm_ranges_fill_n/1048576 1442828 ns 982 ns
bm_ranges_fill/1 1.63 ns 3.17 ns
bm_ranges_fill/2 3.43 ns 3.28 ns
bm_ranges_fill/3 4.97 ns 3.31 ns
bm_ranges_fill/4 6.53 ns 3.27 ns
bm_ranges_fill/5 8.12 ns 3.33 ns
bm_ranges_fill/6 9.76 ns 3.32 ns
bm_ranges_fill/7 11.6 ns 3.29 ns
bm_ranges_fill/8 13.2 ns 3.26 ns
bm_ranges_fill/16 26.3 ns 3.26 ns
bm_ranges_fill/64 104 ns 4.92 ns
bm_ranges_fill/512 716 ns 4.47 ns
bm_ranges_fill/4096 5772 ns 8.21 ns
bm_ranges_fill/32768 45778 ns 33.1 ns
bm_ranges_fill/262144 351422 ns 241 ns
bm_ranges_fill/1048576 1404710 ns 965 ns
```
>From a85183edea154e15aeaf4f2375839f3d3df59d7c Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Tue, 5 Mar 2024 12:33:09 +0100
Subject: [PATCH] [libc++] Optimize ranges::fill{,_n} for
vector<bool>::iterator
---
libcxx/benchmarks/CMakeLists.txt | 1 +
libcxx/benchmarks/algorithms/fill.bench.cpp | 49 +++++++
libcxx/docs/ReleaseNotes/19.rst | 2 +
libcxx/include/__algorithm/fill_n.h | 53 ++++++++
libcxx/include/__bit_reference | 59 +--------
.../alg.fill/fill.pass.cpp | 121 +++++++++++-------
6 files changed, 184 insertions(+), 101 deletions(-)
create mode 100644 libcxx/benchmarks/algorithms/fill.bench.cpp
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index b436e96f178b70..3dec6faea13a0c 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -176,6 +176,7 @@ set(BENCHMARK_TESTS
algorithms/count.bench.cpp
algorithms/equal.bench.cpp
algorithms/find.bench.cpp
+ algorithms/fill.bench.cpp
algorithms/for_each.bench.cpp
algorithms/lower_bound.bench.cpp
algorithms/make_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/fill.bench.cpp b/libcxx/benchmarks/algorithms/fill.bench.cpp
new file mode 100644
index 00000000000000..40f37425c394cf
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/fill.bench.cpp
@@ -0,0 +1,49 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <vector>
+
+static void bm_fill_n(benchmark::State& state) {
+ std::vector<bool> vec1(state.range());
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(vec1);
+ benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false));
+ }
+}
+BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill_n(benchmark::State& state) {
+ std::vector<bool> vec1(state.range());
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(vec1);
+ benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false));
+ }
+}
+BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_fill(benchmark::State& state) {
+ std::vector<bool> vec1(state.range());
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(vec1);
+ std::fill(vec1.begin(), vec1.end(), false);
+ }
+}
+BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+static void bm_ranges_fill(benchmark::State& state) {
+ std::vector<bool> vec1(state.range());
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(vec1);
+ benchmark::DoNotOptimize(std::ranges::fill(vec1, false));
+ }
+}
+BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 04f16610f8117e..e2f5c87b537056 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -49,6 +49,8 @@ Improvements and New Features
-----------------------------
- The performance of growing ``std::vector`` has been improved for trivially relocatable types.
+- The performance of ``ranges::fill`` and ``ranges::fill_n`` has been improved for ``vector<bool>::iterator``\s,
+ resulting in a performance increase of up to 1400x.
Deprecations and Removals
-------------------------
diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h
index 36f3349d9e7a37..03daa1da8fa1b5 100644
--- a/libcxx/include/__algorithm/fill_n.h
+++ b/libcxx/include/__algorithm/fill_n.h
@@ -9,8 +9,11 @@
#ifndef _LIBCPP___ALGORITHM_FILL_N_H
#define _LIBCPP___ALGORITHM_FILL_N_H
+#include <__algorithm/min.h>
#include <__config>
+#include <__fwd/bit_reference.h>
#include <__iterator/iterator_traits.h>
+#include <__memory/pointer_traits.h>
#include <__utility/convert_to_integral.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -21,6 +24,56 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
+template <class _OutputIterator, class _Size, class _Tp>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+
+template <bool _FillVal, class _Cp>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
+__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
+ using _It = __bit_iterator<_Cp, false>;
+ using __storage_type = typename _It::__storage_type;
+
+ const int __bits_per_word = _It::__bits_per_word;
+ // do first partial word
+ if (__first.__ctz_ != 0) {
+ __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
+ __storage_type __dn = std::min(__clz_f, __n);
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+ if (_FillVal)
+ *__first.__seg_ |= __m;
+ else
+ *__first.__seg_ &= ~__m;
+ __n -= __dn;
+ ++__first.__seg_;
+ }
+ // do middle whole words
+ __storage_type __nw = __n / __bits_per_word;
+ std::__fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
+ __n -= __nw * __bits_per_word;
+ // do last partial word
+ if (__n > 0) {
+ __first.__seg_ += __nw;
+ __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ if (_FillVal)
+ *__first.__seg_ |= __m;
+ else
+ *__first.__seg_ &= ~__m;
+ }
+}
+
+template <class _Cp, class _Size>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
+__fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
+ if (__n > 0) {
+ if (__value)
+ std::__fill_n_bool<true>(__first, __n);
+ else
+ std::__fill_n_bool<false>(__first, __n);
+ }
+ return __first + __n;
+}
+
template <class _OutputIterator, class _Size, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 3a5339b72ddc31..9579b9eaf70bbd 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -171,61 +171,6 @@ private:
__bit_const_reference& operator=(const __bit_const_reference&) = delete;
};
-// fill_n
-
-template <bool _FillVal, class _Cp>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
-__fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
- using _It = __bit_iterator<_Cp, false>;
- using __storage_type = typename _It::__storage_type;
-
- const int __bits_per_word = _It::__bits_per_word;
- // do first partial word
- if (__first.__ctz_ != 0) {
- __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
- __storage_type __dn = std::min(__clz_f, __n);
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
- if (_FillVal)
- *__first.__seg_ |= __m;
- else
- *__first.__seg_ &= ~__m;
- __n -= __dn;
- ++__first.__seg_;
- }
- // do middle whole words
- __storage_type __nw = __n / __bits_per_word;
- std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
- __n -= __nw * __bits_per_word;
- // do last partial word
- if (__n > 0) {
- __first.__seg_ += __nw;
- __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
- if (_FillVal)
- *__first.__seg_ |= __m;
- else
- *__first.__seg_ &= ~__m;
- }
-}
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) {
- if (__n > 0) {
- if (__value)
- std::__fill_n<true>(__first, __n);
- else
- std::__fill_n<false>(__first, __n);
- }
-}
-
-// fill
-
-template <class _Cp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) {
- std::fill_n(__first, static_cast<typename _Cp::size_type>(__last - __first), __value);
-}
-
// copy
template <class _Cp, bool _IsConst>
@@ -1007,8 +952,10 @@ private:
friend class __bit_iterator<_Cp, true>;
template <class _Dp>
friend struct __bit_array;
+
template <bool _FillVal, class _Dp>
- _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
+ __fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned(
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
index da56ec30f128b1..7960483104a446 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
@@ -14,62 +14,93 @@
// fill(Iter first, Iter last, const T& value);
#include <algorithm>
+#include <array>
#include <cassert>
+#include <vector>
#include "test_macros.h"
#include "test_iterators.h"
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
- int ia[] = {0, 1, 2, 3, 4};
-
- std::fill(std::begin(ia), std::end(ia), 5);
+template <class Iter, class Container>
+TEST_CONSTEXPR_CXX20 void
+test(Container in, size_t from, size_t to, typename Container::value_type value, Container expected) {
+ std::fill(Iter(in.data() + from), Iter(in.data() + to), value);
+ assert(in == expected);
+}
- return std::all_of(std::begin(ia), std::end(ia), [](int a) {return a == 5; })
- ;
+template <class T>
+struct Test {
+ template <class Iter>
+ TEST_CONSTEXPR_CXX20 void operator()() {
+ {
+ std::array<T, 4> in = {1, 2, 3, 4};
+ std::array<T, 4> expected = {5, 5, 5, 5};
+ test<Iter>(in, 0, 4, 5, expected);
}
-#endif
+ {
+ std::array<T, 4> in = {1, 2, 3, 4};
+ std::array<T, 4> expected = {1, 5, 5, 4};
+ test<Iter>(in, 1, 3, 5, expected);
+ }
+ }
+};
-template <class Iter>
-void
-test_char()
-{
- const unsigned n = 4;
- char ca[n] = {0};
- std::fill(Iter(ca), Iter(ca+n), char(1));
- assert(ca[0] == 1);
- assert(ca[1] == 1);
- assert(ca[2] == 1);
- assert(ca[3] == 1);
-}
+TEST_CONSTEXPR_CXX20 bool test() {
+ types::for_each(types::forward_iterator_list<char*>(), Test<char>());
+ types::for_each(types::forward_iterator_list<int*>(), Test<int>());
+ { // test vector<bool>::iterator optimization
+ using Iter = typename std::vector<bool>::iterator;
-template <class Iter>
-void
-test_int()
-{
- const unsigned n = 4;
- int ia[n] = {0};
- std::fill(Iter(ia), Iter(ia+n), 1);
- assert(ia[0] == 1);
- assert(ia[1] == 1);
- assert(ia[2] == 1);
- assert(ia[3] == 1);
+ { // simple case
+ std::vector<bool> in(4, false);
+ std::vector<bool> expected(4, true);
+ std::fill(in.begin(), in.end(), true);
+ assert(in == expected);
+ }
+ { // partial byte in the front is not filled
+ std::vector<bool> in(8, false);
+ std::vector<bool> expected(8, true);
+ expected[0] = false;
+ expected[1] = false;
+ std::fill(in.begin() + 2, in.end(), true);
+ assert(in == expected);
+ }
+ { // partial byte in the back is not filled
+ std::vector<bool> in(8, false);
+ std::vector<bool> expected(8, true);
+ expected[6] = false;
+ expected[7] = false;
+ std::fill(in.begin(), in.end() - 2, true);
+ assert(in == expected);
+ }
+ { // partial byte in the front and back is not filled
+ std::vector<bool> in(16, false);
+ std::vector<bool> expected(16, true);
+ expected[0] = false;
+ expected[1] = false;
+ expected[14] = false;
+ expected[15] = false;
+ std::fill(in.begin() + 2, in.end() - 2, true);
+ assert(in == expected);
+ }
+ { // only a few bits of a byte are set
+ std::vector<bool> in(8, false);
+ std::vector<bool> expected(8, true);
+ expected[0] = false;
+ expected[1] = false;
+ expected[6] = false;
+ expected[7] = false;
+ std::fill(in.begin() + 2, in.end() - 2, true);
+ assert(in == expected);
+ }
+ }
+ return true;
}
-int main(int, char**)
-{
- test_char<forward_iterator<char*> >();
- test_char<bidirectional_iterator<char*> >();
- test_char<random_access_iterator<char*> >();
- test_char<char*>();
-
- test_int<forward_iterator<int*> >();
- test_int<bidirectional_iterator<int*> >();
- test_int<random_access_iterator<int*> >();
- test_int<int*>();
-
-#if TEST_STD_VER > 17
- static_assert(test_constexpr());
+int main(int, char**) {
+ test();
+#if TEST_STD_VER >= 20
+ static_assert(test());
#endif
return 0;
More information about the libcxx-commits
mailing list