[libcxx-commits] [libcxx] [libc++] Optimize {std, ranges}::{fill, fill_n} for segmented iterators (PR #132665)
via libcxx-commits
libcxx-commits at lists.llvm.org
Mon Mar 24 10:45:25 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libcxx
Author: Peng Liu (winner245)
<details>
<summary>Changes</summary>
This patch optimizes `std::fill`, `std::fill_n`, `std::ranges::fill`, and `std::ranges::fill_n` for segmented iterators, achieving substantial performance improvements. Specifically, for `deque<int>` iterators, the speedups peak above 10x for all of these algorithms. The optimization for segmented iterators is rather effective, as it enables filling segmented memory in `deque<int>` to match the performance of filling contiguous memory in `vector<int>`.
Below are the benchmark results comparing the before and after implementations. For reference purposes, we've also provided the benchmarks for `vector<int>`, though its performance remains unchanged, as this patch focuses solely on segmented iterators and does not modify contiguous memory handling.
Fixes two subtasks outlined in #<!-- -->102817.
#### `fill_n`
```
----------------------------------------------------------------------------
Benchmark Before After Speedup
----------------------------------------------------------------------------
std::fill_n(deque<int>)/32 12.9 ns 3.00 ns 4.3x
std::fill_n(deque<int>)/50 21.0 ns 4.07 ns 5.2x
std::fill_n(deque<int>)/1024 362 ns 39.4 ns 9.2x
std::fill_n(deque<int>)/8192 2945 ns 239 ns 12.3x
std::fill_n(deque<int>)/65536 23760 ns 2891 ns 8.2x
std::fill_n(deque<int>)/1048576 390518 ns 78182 ns 5.0x
rng::fill_n(deque<int>)/32 14.1 ns 3.16 ns 4.5x
rng::fill_n(deque<int>)/50 22.7 ns 4.06 ns 5.6x
rng::fill_n(deque<int>)/1024 382 ns 39.1 ns 9.8x
rng::fill_n(deque<int>)/8192 2984 ns 260 ns 11.5x
rng::fill_n(deque<int>)/65536 23809 ns 2875 ns 8.3x
rng::fill_n(deque<int>)/1048576 377297 ns 78241 ns 4.8x
std::fill_n(vector<int>)/32 1.97 ns 1.56 ns 1.3x
std::fill_n(vector<int>)/50 1.96 ns 2.03 ns 1.0x
std::fill_n(vector<int>)/1024 41.5 ns 43.0 ns 1.0x
std::fill_n(vector<int>)/8192 231 ns 245 ns 0.9x
std::fill_n(vector<int>)/65536 2946 ns 3092 ns 1.0x
std::fill_n(vector<int>)/1048576 73327 ns 78515 ns 0.9x
rng::fill_n(vector<int>)/32 1.81 ns 1.89 ns 1.0x
rng::fill_n(vector<int>)/50 2.68 ns 2.79 ns 1.0x
rng::fill_n(vector<int>)/1024 37.2 ns 37.9 ns 1.0x
rng::fill_n(vector<int>)/8192 251 ns 253 ns 1.0x
rng::fill_n(vector<int>)/65536 2876 ns 2919 ns 1.0x
rng::fill_n(vector<int>)/1048576 78037 ns 78382 ns 1.0x
```
#### `fill`
```
--------------------------------------------------------------------------
Benchmark Before After Speedup
--------------------------------------------------------------------------
std::fill(deque<int>)/32 13.9 ns 2.99 ns 4.6x
std::fill(deque<int>)/50 20.8 ns 4.38 ns 4.7x
std::fill(deque<int>)/1024 360 ns 36.8 ns 9.8x
std::fill(deque<int>)/8192 2931 ns 223 ns 13.1x
std::fill(deque<int>)/65536 23794 ns 2766 ns 8.6x
std::fill(deque<int>)/1048576 386669 ns 76049 ns 5.1x
rng::fill(deque<int>)/32 14.9 ns 3.87 ns 3.9x
rng::fill(deque<int>)/50 23.3 ns 4.82 ns 4.8x
rng::fill(deque<int>)/1024 376 ns 38.0 ns 9.9x
rng::fill(deque<int>)/8192 2969 ns 251 ns 11.8x
rng::fill(deque<int>)/65536 24008 ns 2827 ns 8.5x
rng::fill(deque<int>)/1048576 381733 ns 77591 ns 4.9x
std::fill(vector<int>)/32 1.81 ns 2.03 ns 0.9x
std::fill(vector<int>)/50 2.61 ns 2.64 ns 1.0x
std::fill(vector<int>)/1024 35.7 ns 36.7 ns 1.0x
std::fill(vector<int>)/8192 241 ns 243 ns 1.0x
std::fill(vector<int>)/65536 2766 ns 2849 ns 1.0x
std::fill(vector<int>)/1048576 76281 ns 77685 ns 1.0x
rng::fill(vector<int>)/32 1.73 ns 1.69 ns 1.0x
rng::fill(vector<int>)/50 2.27 ns 2.25 ns 1.0x
rng::fill(vector<int>)/1024 35.2 ns 35.4 ns 1.0x
rng::fill(vector<int>)/8192 246 ns 246 ns 1.0x
rng::fill(vector<int>)/65536 2843 ns 2900 ns 1.0x
rng::fill(vector<int>)/1048576 77900 ns 78126 ns 1.0x
```
---
Full diff: https://github.com/llvm/llvm-project/pull/132665.diff
8 Files Affected:
- (modified) libcxx/include/CMakeLists.txt (+1)
- (modified) libcxx/include/__algorithm/fill.h (+37-5)
- (modified) libcxx/include/__algorithm/fill_n.h (+21-9)
- (added) libcxx/include/__fwd/fill.h (+26)
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp (+26)
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp (+26)
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp (+22)
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp (+22)
``````````diff
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index a021b9bb44d67..1ac582dcc1614 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -435,6 +435,7 @@ set(files
__fwd/byte.h
__fwd/complex.h
__fwd/deque.h
+ __fwd/fill.h
__fwd/format.h
__fwd/fstream.h
__fwd/functional.h
diff --git a/libcxx/include/__algorithm/fill.h b/libcxx/include/__algorithm/fill.h
index 1ce3eadb013d0..e519022f18084 100644
--- a/libcxx/include/__algorithm/fill.h
+++ b/libcxx/include/__algorithm/fill.h
@@ -10,8 +10,11 @@
#define _LIBCPP___ALGORITHM_FILL_H
#include <__algorithm/fill_n.h>
+#include <__algorithm/for_each_segment.h>
#include <__config>
#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+#include <__type_traits/enable_if.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -21,23 +24,52 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// fill isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
-template <class _ForwardIterator, class _Tp>
+template <class _ForwardIterator,
+ class _Sentinel,
+ class _Tp,
+ __enable_if_t<__has_forward_iterator_category<_ForwardIterator>::value, int> = 0>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, forward_iterator_tag) {
+__fill(_ForwardIterator __first, _Sentinel __last, const _Tp& __value) {
for (; __first != __last; ++__first)
*__first = __value;
}
-template <class _RandomAccessIterator, class _Tp>
+template <class _OutIter, class _Tp>
+struct _FillSegment {
+ using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_OutIter>;
+
+ const _Tp& __value_;
+
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit _FillSegment(const _Tp& __value) : __value_(__value) {}
+
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
+ operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
+ std::__fill(__lfirst, __llast, __value_);
+ }
+};
+
+template <class _RandomAccessIterator,
+ class _Tp,
+ __enable_if_t<__has_random_access_iterator_category<_RandomAccessIterator>::value &&
+ !__is_segmented_iterator<_RandomAccessIterator>::value,
+ int> = 0>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value, random_access_iterator_tag) {
+__fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value) {
std::fill_n(__first, __last - __first, __value);
}
+template <class _SegmentedIterator,
+ class _Tp,
+ __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
+__fill(_SegmentedIterator __first, _SegmentedIterator __last, const _Tp& __value) {
+ std::__for_each_segment(__first, __last, _FillSegment<_SegmentedIterator, _Tp>(__value));
+}
+
template <class _ForwardIterator, class _Tp>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
- std::__fill(__first, __last, __value, typename iterator_traits<_ForwardIterator>::iterator_category());
+ std::__fill(__first, __last, __value);
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/fill_n.h b/libcxx/include/__algorithm/fill_n.h
index 0da78e1f38c4c..c492ec882a483 100644
--- a/libcxx/include/__algorithm/fill_n.h
+++ b/libcxx/include/__algorithm/fill_n.h
@@ -12,7 +12,12 @@
#include <__algorithm/min.h>
#include <__config>
#include <__fwd/bit_reference.h>
+#include <__fwd/fill.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/next.h>
+#include <__iterator/segmented_iterator.h>
#include <__memory/pointer_traits.h>
+#include <__type_traits/enable_if.h>
#include <__utility/convert_to_integral.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -26,9 +31,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
-template <class _OutputIterator, class _Size, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
-__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
+template <class _OutIter, class _Size, class _Tp, __enable_if_t<!__is_segmented_iterator<_OutIter>::value, int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutIter
+__fill_n(_OutIter __first, _Size __n, const _Tp& __value) {
+ for (; __n > 0; ++__first, (void)--__n)
+ *__first = __value;
+ return __first;
+}
template <bool _FillVal, class _Cp>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
@@ -68,12 +77,15 @@ __fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
return __first + __n;
}
-template <class _OutputIterator, class _Size, class _Tp>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
-__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
- for (; __n > 0; ++__first, (void)--__n)
- *__first = __value;
- return __first;
+template < class _OutIter,
+ class _Size,
+ class _Tp,
+ __enable_if_t<__is_segmented_iterator<_OutIter>::value && __has_forward_iterator_category<_OutIter>::value,
+ int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _OutIter __fill_n(_OutIter __first, _Size __n, const _Tp& __value) {
+ _OutIter __last = std::next(__first, __n);
+ std::fill(__first, __last, __value);
+ return __last;
}
template <class _OutputIterator, class _Size, class _Tp>
diff --git a/libcxx/include/__fwd/fill.h b/libcxx/include/__fwd/fill.h
new file mode 100644
index 0000000000000..628c552757201
--- /dev/null
+++ b/libcxx/include/__fwd/fill.h
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___FWD_FILL_H
+#define _LIBCPP___FWD_FILL_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ForwardIterator, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
+fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value);
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___FWD_FILL_H
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
index 9b403db85ebf9..0b87f9038e618 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp
@@ -17,6 +17,8 @@
#include <array>
#include <cassert>
#include <cstddef>
+#include <deque>
+#include <ranges>
#include <vector>
#include "sized_allocator.h"
@@ -93,6 +95,27 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) {
return true;
}
+/*TEST_CONSTEXPR_CXX23*/ void
+test_segmented_iterator() { // TODO: Mark this test as TEST_CONSTEXPR_CXX23 when std::deque is constexpr
+ { // std::deque iterator
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::fill(in.begin(), in.end(), 42);
+ assert(in == expected);
+ }
+
+#if TEST_STD_VER >= 20
+ { // join_view iterator
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {0, 0}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::fill(jv.begin(), jv.end(), 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+#endif
+}
+
TEST_CONSTEXPR_CXX20 bool test() {
types::for_each(types::forward_iterator_list<char*>(), Test<char>());
types::for_each(types::forward_iterator_list<int*>(), Test<int>());
@@ -138,6 +161,9 @@ TEST_CONSTEXPR_CXX20 bool test() {
}
}
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_segmented_iterator();
+
return true;
}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp
index 4dda8714d2cfa..3d397bc6c9f06 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp
@@ -17,6 +17,8 @@
#include <array>
#include <cassert>
#include <cstddef>
+#include <deque>
+#include <ranges>
#include <vector>
#include "sized_allocator.h"
@@ -176,6 +178,27 @@ TEST_CONSTEXPR_CXX20 void test_struct_array() {
}
}
+/*TEST_CONSTEXPR_CXX23*/ void
+test_segmented_iterator() { // TODO: Mark this test as TEST_CONSTEXPR_CXX23 when std::deque is constexpr
+ { // std::deque iterator
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::fill_n(in.begin(), in.size(), 42);
+ assert(in == expected);
+ }
+
+#if TEST_STD_VER >= 20
+ { // join_view iterator
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {0, 0}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::fill_n(jv.begin(), std::distance(jv.begin(), jv.end()), 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+#endif
+}
+
TEST_CONSTEXPR_CXX20 bool test() {
types::for_each(types::forward_iterator_list<char*>(), Test<char>());
types::for_each(types::forward_iterator_list<int*>(), Test<int>());
@@ -225,6 +248,9 @@ TEST_CONSTEXPR_CXX20 bool test() {
}
}
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_segmented_iterator();
+
return true;
}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp
index 61a659fb0028c..e610fd3c3cb06 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill.pass.cpp
@@ -18,6 +18,7 @@
#include <algorithm>
#include <array>
#include <cassert>
+#include <deque>
#include <ranges>
#include <string>
#include <vector>
@@ -128,6 +129,24 @@ constexpr bool test_vector_bool(std::size_t N) {
}
#endif
+/*TEST_CONSTEXPR_CXX23*/ void
+test_segmented_range() { // TODO: Mark this test as TEST_CONSTEXPR_CXX23 when std::deque is constexpr
+ { // std::deque
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::ranges::fill(in, 42);
+ assert(in == expected);
+ }
+ { // join_view
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {0, 0}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::ranges::fill(jv, 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+}
+
constexpr bool test() {
test_iterators<cpp17_output_iterator<int*>, sentinel_wrapper<cpp17_output_iterator<int*>>>();
test_iterators<cpp20_output_iterator<int*>, sentinel_wrapper<cpp20_output_iterator<int*>>>();
@@ -227,6 +246,9 @@ constexpr bool test() {
}
#endif
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_segmented_range();
+
return true;
}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp
index 2d6e24a03e0b3..2c7b0fe091c04 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/ranges.fill_n.pass.cpp
@@ -16,6 +16,7 @@
#include <algorithm>
#include <array>
#include <cassert>
+#include <deque>
#include <ranges>
#include <string>
#include <vector>
@@ -101,6 +102,24 @@ constexpr bool test_vector_bool(std::size_t N) {
}
#endif
+/*TEST_CONSTEXPR_CXX23*/ void
+test_segmented_range() { // TODO: Mark this test as TEST_CONSTEXPR_CXX23 when std::deque is constexpr
+ { // std::deque
+ std::deque<int> in(20);
+ std::deque<int> expected(in.size(), 42);
+ std::ranges::fill_n(std::ranges::begin(in), std::ranges::size(in), 42);
+ assert(in == expected);
+ }
+ { // join_view
+ std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {0, 0}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
+ auto jv = std::ranges::join_view(v);
+ std::ranges::fill_n(std::ranges::begin(jv), std::ranges::distance(jv), 42);
+ for (const auto& vec : v)
+ for (auto n : vec)
+ assert(n == 42);
+ }
+}
+
constexpr bool test() {
test_iterators<cpp17_output_iterator<int*>, sentinel_wrapper<cpp17_output_iterator<int*>>>();
test_iterators<cpp20_output_iterator<int*>, sentinel_wrapper<cpp20_output_iterator<int*>>>();
@@ -175,6 +194,9 @@ constexpr bool test() {
}
#endif
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_segmented_range();
+
return true;
}
``````````
</details>
https://github.com/llvm/llvm-project/pull/132665
More information about the libcxx-commits
mailing list