[libcxx-commits] [libcxx] [libc++] Optimize std::for_each_n for segmented iterators (PR #135468)
Peng Liu via libcxx-commits
libcxx-commits at lists.llvm.org
Sat Apr 19 04:20:26 PDT 2025
https://github.com/winner245 updated https://github.com/llvm/llvm-project/pull/135468
>From 5b6a84655ea5023e8b1916c6164b39b7bcc3e9ee Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Fri, 11 Apr 2025 17:37:10 -0400
Subject: [PATCH] Optimize std::for_each_n for segmented iterators
---
libcxx/docs/ReleaseNotes/21.rst | 3 +
libcxx/include/CMakeLists.txt | 1 +
libcxx/include/__algorithm/for_each.h | 1 -
libcxx/include/__algorithm/for_each_n.h | 70 +++++++++-
.../include/__algorithm/for_each_n_segment.h | 63 +++++++++
libcxx/include/module.modulemap.in | 1 +
.../nonmodifying/for_each_n.bench.cpp | 98 +++++++++++++
.../alg.foreach/for_each_n.pass.cpp | 129 ++++++++++++------
8 files changed, 318 insertions(+), 48 deletions(-)
create mode 100644 libcxx/include/__algorithm/for_each_n_segment.h
create mode 100644 libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index a7382c5222d08..3ea9f17418447 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -70,6 +70,9 @@ Improvements and New Features
- The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
in C++23 and later.
+- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
+ up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::ranges::join_view<vector<vector<short>>>`` iterators.
+
Deprecations and Removals
-------------------------
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index f1bdf684a8549..b6de4b1800dff 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -25,6 +25,7 @@ set(files
__algorithm/find_segment_if.h
__algorithm/for_each.h
__algorithm/for_each_n.h
+ __algorithm/for_each_n_segment.h
__algorithm/for_each_segment.h
__algorithm/generate.h
__algorithm/generate_n.h
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index b6c2c7c056edd..0b14d8c219931 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -9,7 +9,6 @@
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_H
#define _LIBCPP___ALGORITHM_FOR_EACH_H
-
#include <__algorithm/for_each_segment.h>
#include <__config>
#include <__iterator/segmented_iterator.h>
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index fce380b49df3e..12b8d1810685d 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -10,20 +10,36 @@
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H
#define _LIBCPP___ALGORITHM_FOR_EACH_N_H
+#include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n_segment.h>
#include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+#include <__type_traits/enable_if.h>
#include <__utility/convert_to_integral.h>
+#include <__utility/move.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
-_LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
-#if _LIBCPP_STD_VER >= 17
+_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _InputIterator, class _Size, class _Function>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+template <class _InputIterator,
+ class _Size,
+ class _Func,
+ __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
+ (!__is_segmented_iterator<_InputIterator>::value
+ // || !__has_random_access_iterator_category<
+ // typename __segmented_iterator_traits<_InputIterator>::__local_iterator>::value
+ ), // TODO: Naming __segmented_iterator_traits<_InputIterator> in this SFINAE condition instantiates the template,
+ // which is a hard error rather than a substitution failure. Once that is fixed, re-enable the commented-out check above.
+ int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
_IntegralSize __n = __orig_n;
while (__n > 0) {
@@ -31,11 +47,51 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
++__first;
--__n;
}
- return __first;
+ return std::move(__first);
}
-#endif
+template <class _RandIter,
+ class _Size,
+ class _Func,
+ __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
+__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
+ typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
+ auto __last = __first + __n;
+ std::__for_each(__first, __last, __f);
+ return std::move(__last);
+}
+
+#ifndef _LIBCPP_CXX03_LANG
+template <class _SegmentedIterator,
+ class _Size,
+ class _Func,
+ __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
+ __is_segmented_iterator<_SegmentedIterator>::value &&
+ __has_random_access_iterator_category<
+ typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
+ int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
+ using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
+ return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
+ std::__for_each(__lfirst, __llast, __f);
+ });
+}
+#endif // !_LIBCPP_CXX03_LANG
+
+#if _LIBCPP_STD_VER >= 17
+
+template <class _InputIterator, class _Size, class _Function>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+ return std::__for_each_n(__first, __orig_n, __f);
+}
+
+#endif // _LIBCPP_STD_VER >= 17
_LIBCPP_END_NAMESPACE_STD
+_LIBCPP_POP_MACROS
+
#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_H
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
new file mode 100644
index 0000000000000..1b522fb373eee
--- /dev/null
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+#define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+
+#include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// __for_each_n_segment optimizes linear iteration over segmented iterators. It walks the segmented input
+// range [__first, __first + __orig_n) one segment at a time, calling __func(__local_first, __local_last) on the
+// part of each segment that lies within the range. The return value of __func is ignored, and the function
+// returns an iterator pointing to one past the last processed element in the input range.
+
+template <class _SegmentedIterator, class _Size, class _Functor>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
+__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
+ static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
+ __has_random_access_iterator_category<
+ typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
+ "__for_each_n_segment only works with segmented iterators with random-access local iterators");
+ if (__orig_n <= 0)
+ return __first;
+
+ using _Traits = __segmented_iterator_traits<_SegmentedIterator>;
+ using __local_iter_t = typename _Traits::__local_iterator;
+ using __difference_t = typename std::iterator_traits<__local_iter_t>::difference_type;
+ __difference_t __n = __orig_n;
+ auto __seg = _Traits::__segment(__first);
+ auto __local_first = _Traits::__local(__first);
+ __local_iter_t __local_last;
+
+ while (__n > 0) {
+ __local_last = _Traits::__end(__seg);
+ auto __seg_size = __local_last - __local_first;
+ if (__n <= __seg_size) {
+ __local_last = __local_first + __n;
+ __func(__local_first, __local_last);
+ break;
+ }
+ __func(__local_first, __local_last);
+ __n -= __seg_size;
+ __local_first = _Traits::__begin(++__seg);
+ }
+
+ return _Traits::__compose(__seg, __local_last);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index af928a63f2315..8e8b7a6f400d9 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -437,6 +437,7 @@ module std [system] {
module find_segment_if { header "__algorithm/find_segment_if.h" }
module find { header "__algorithm/find.h" }
module for_each_n { header "__algorithm/for_each_n.h" }
+ module for_each_n_segment { header "__algorithm/for_each_n_segment.h" }
module for_each_segment { header "__algorithm/for_each_segment.h" }
module for_each { header "__algorithm/for_each.h" }
module generate_n { header "__algorithm/generate_n.h" }
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
new file mode 100644
index 0000000000000..784708c7e01eb
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -0,0 +1,98 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <list>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+int main(int argc, char** argv) {
+ auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
+
+ // std::for_each_n
+ {
+ auto bm = []<class Container>(std::string name, auto for_each_n) {
+ using ElemType = typename Container::value_type;
+ benchmark::RegisterBenchmark(
+ name,
+ [for_each_n](auto& st) {
+ std::size_t const n = st.range(0);
+ Container c(n, 1);
+ auto first = c.begin();
+
+ for ([[maybe_unused]] auto _ : st) {
+ benchmark::DoNotOptimize(c);
+ auto result = for_each_n(first, n, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+ benchmark::DoNotOptimize(result);
+ }
+ })
+ ->Arg(8)
+ ->Arg(32)
+ ->Arg(50) // non power-of-two
+ ->Arg(1024)
+ ->Arg(4096)
+ ->Arg(8192)
+ ->Arg(1 << 14)
+ ->Arg(1 << 16)
+ ->Arg(1 << 18);
+ };
+ bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
+ bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
+ bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
+ }
+
+ // std::for_each_n for join_view
+ {
+ auto bm = []<class Container>(std::string name, auto for_each_n) {
+ using C1 = typename Container::value_type;
+ using ElemType = typename C1::value_type;
+ benchmark::RegisterBenchmark(
+ name,
+ [for_each_n](auto& st) {
+ std::size_t const size = st.range(0);
+ std::size_t const seg_size = 256;
+ std::size_t const segments = (size + seg_size - 1) / seg_size;
+ Container c(segments);
+ for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+ c[i].resize(std::min(seg_size, n), ElemType(1));
+ }
+
+ auto view = c | std::views::join;
+ auto first = view.begin();
+
+ for ([[maybe_unused]] auto _ : st) {
+ benchmark::DoNotOptimize(c);
+ auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+ benchmark::DoNotOptimize(result);
+ }
+ })
+ ->Arg(8)
+ ->Arg(32)
+ ->Arg(50) // non power-of-two
+ ->Arg(1024)
+ ->Arg(4096)
+ ->Arg(8192)
+ ->Arg(1 << 14)
+ ->Arg(1 << 16)
+ ->Arg(1 << 18);
+ };
+ bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+ }
+
+ benchmark::Initialize(&argc, argv);
+ benchmark::RunSpecifiedBenchmarks();
+ benchmark::Shutdown();
+ return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index 371f6c92f1ed1..39c1174dcec8b 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -13,69 +13,118 @@
// constexpr InputIterator // constexpr after C++17
// for_each_n(InputIterator first, Size n, Function f);
-
#include <algorithm>
#include <cassert>
+#include <deque>
#include <functional>
+#include <iterator>
+#include <ranges>
+#include <vector>
#include "test_macros.h"
#include "test_iterators.h"
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
- int ia[] = {1, 3, 6, 7};
- int expected[] = {3, 5, 8, 9};
- const std::size_t N = 4;
+struct for_each_test {
+ TEST_CONSTEXPR for_each_test(int c) : count(c) {}
+ int count;
+ TEST_CONSTEXPR_CXX14 void operator()(int& i) {
+ ++i;
+ ++count;
+ }
+};
- auto it = std::for_each_n(std::begin(ia), N, [](int &a) { a += 2; });
- return it == (std::begin(ia) + N)
- && std::equal(std::begin(ia), std::end(ia), std::begin(expected))
- ;
- }
-#endif
+struct deque_test {
+ std::deque<int>* d_;
+ int* i_;
+
+ deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
-struct for_each_test
-{
- for_each_test(int c) : count(c) {}
- int count;
- void operator()(int& i) {++i; ++count;}
+ void operator()(int& v) {
+ assert(&(*d_)[*i_] == &v);
+ ++*i_;
+ }
};
-int main(int, char**)
-{
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+ // check that segmented deque iterators work properly
+ int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+ for (const int size : sizes) {
+ std::deque<int> d(size);
+ int index = 0;
+
+ std::for_each_n(d.begin(), d.size(), deque_test(d, index));
+ }
+}
+
+TEST_CONSTEXPR_CXX20 bool test() {
+ {
typedef cpp17_input_iterator<int*> Iter;
- int ia[] = {0, 1, 2, 3, 4, 5};
- const unsigned s = sizeof(ia)/sizeof(ia[0]);
+ int ia[] = {0, 1, 2, 3, 4, 5};
+ const unsigned s = sizeof(ia) / sizeof(ia[0]);
{
- auto f = for_each_test(0);
- Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
- assert(it == Iter(ia));
- assert(f.count == 0);
+ unsigned count = 0;
+ Iter it = std::for_each_n(Iter(ia), 0, [&count](int& i) {
+ ++i;
+ ++count;
+ });
+ assert(it == Iter(ia));
+ assert(count == 0);
}
{
- auto f = for_each_test(0);
- Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
-
- assert(it == Iter(ia+s));
- assert(f.count == s);
- for (unsigned i = 0; i < s; ++i)
- assert(ia[i] == static_cast<int>(i+1));
+ unsigned count = 0;
+ Iter it = std::for_each_n(Iter(ia), s, [&count](int& i) {
+ ++i;
+ ++count;
+ });
+ assert(it == Iter(ia + s));
+ assert(count == s);
+ for (unsigned i = 0; i < s; ++i)
+ assert(ia[i] == static_cast<int>(i + 1));
}
{
- auto f = for_each_test(0);
- Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
-
- assert(it == Iter(ia+1));
- assert(f.count == 1);
- for (unsigned i = 0; i < 1; ++i)
- assert(ia[i] == static_cast<int>(i+2));
+ unsigned count = 0;
+ Iter it = std::for_each_n(Iter(ia), 1, [&count](int& i) {
+ ++i;
+ ++count;
+ });
+ assert(it == Iter(ia + 1));
+ assert(count == 1);
+ for (unsigned i = 0; i < 1; ++i)
+ assert(ia[i] == static_cast<int>(i + 2));
}
+ }
+
+ {
+ int ia[] = {1, 3, 6, 7};
+ int expected[] = {3, 5, 8, 9};
+ const std::size_t N = 4;
+
+ auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
+ assert(it == (std::begin(ia) + N) && std::equal(std::begin(ia), std::end(ia), std::begin(expected)));
+ }
+
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_segmented_deque_iterator();
+
+#if TEST_STD_VER >= 20
+ {
+ std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+ auto v = vec | std::views::join;
+ std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
+ }
+#endif
+
+ return true;
+}
+int main(int, char**) {
+ assert(test());
#if TEST_STD_VER > 17
- static_assert(test_constexpr());
+ static_assert(test());
#endif
return 0;
More information about the libcxx-commits
mailing list