[libcxx-commits] [libcxx] [libc++] Optimize std::for_each_n for segmented iterators (PR #135468)

Peng Liu via libcxx-commits libcxx-commits at lists.llvm.org
Sat Apr 19 04:20:26 PDT 2025


https://github.com/winner245 updated https://github.com/llvm/llvm-project/pull/135468

>From 5b6a84655ea5023e8b1916c6164b39b7bcc3e9ee Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Fri, 11 Apr 2025 17:37:10 -0400
Subject: [PATCH] Optimize std::for_each_n for segmented iterators

---
 libcxx/docs/ReleaseNotes/21.rst               |   3 +
 libcxx/include/CMakeLists.txt                 |   1 +
 libcxx/include/__algorithm/for_each.h         |   1 -
 libcxx/include/__algorithm/for_each_n.h       |  70 +++++++++-
 .../include/__algorithm/for_each_n_segment.h  |  63 +++++++++
 libcxx/include/module.modulemap.in            |   1 +
 .../nonmodifying/for_each_n.bench.cpp         |  98 +++++++++++++
 .../alg.foreach/for_each_n.pass.cpp           | 129 ++++++++++++------
 8 files changed, 318 insertions(+), 48 deletions(-)
 create mode 100644 libcxx/include/__algorithm/for_each_n_segment.h
 create mode 100644 libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index a7382c5222d08..3ea9f17418447 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -70,6 +70,9 @@ Improvements and New Features
 - The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
   in C++23 and later.
 
+- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
+  up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index f1bdf684a8549..b6de4b1800dff 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -25,6 +25,7 @@ set(files
   __algorithm/find_segment_if.h
   __algorithm/for_each.h
   __algorithm/for_each_n.h
+  __algorithm/for_each_n_segment.h
   __algorithm/for_each_segment.h
   __algorithm/generate.h
   __algorithm/generate_n.h
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index b6c2c7c056edd..0b14d8c219931 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -9,7 +9,6 @@
 
 #ifndef _LIBCPP___ALGORITHM_FOR_EACH_H
 #define _LIBCPP___ALGORITHM_FOR_EACH_H
-
 #include <__algorithm/for_each_segment.h>
 #include <__config>
 #include <__iterator/segmented_iterator.h>
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index fce380b49df3e..12b8d1810685d 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -10,20 +10,36 @@
 #ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H
 #define _LIBCPP___ALGORITHM_FOR_EACH_N_H
 
+#include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n_segment.h>
 #include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+#include <__type_traits/enable_if.h>
 #include <__utility/convert_to_integral.h>
+#include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
-_LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
 
-#if _LIBCPP_STD_VER >= 17
+_LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIterator, class _Size, class _Function>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+template <class _InputIterator,
+          class _Size,
+          class _Func,
+          __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
+                            (!__is_segmented_iterator<_InputIterator>::value
+                             //   || !__has_random_access_iterator_category<
+                             //      typename __segmented_iterator_traits<_InputIterator>::__local_iterator>::value
+                             ), // TODO: naming __segmented_iterator_traits<_InputIterator> here instantiates the
+                                // template during SFINAE, a hard error. Once that is fixed, uncomment the condition.
+                        int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n = __orig_n;
   while (__n > 0) {
@@ -31,11 +47,51 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
     ++__first;
     --__n;
   }
-  return __first;
+  return std::move(__first);
 }
 
-#endif
+// Random-access overload: the end of the range is computed up front as __first + __n and the
+// half-open range is handed to std::__for_each. Returns the iterator one past the last element.
+template <class _RandIter, class _Size, class _Func,
+          __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
+__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
+  typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
+  auto __last                                                   = __first + __n;
+  std::__for_each(__first, __last, __f);
+  return __last; // a plain return lets the local be elided or moved; std::move here blocks NRVO
+}
+
+#ifndef _LIBCPP_CXX03_LANG
+// Segmented-iterator overload: forwards each segment's local range to std::__for_each.
+template <class _SegmentedIterator, class _Size, class _Func,
+          __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
+                            __is_segmented_iterator<_SegmentedIterator>::value &&
+                            __has_random_access_iterator_category<
+                                typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
+                        int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
+  typedef typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator _LocalIter;
+  auto __visit_segment = [&__f](_LocalIter __seg_first, _LocalIter __seg_last) {
+    std::__for_each(__seg_first, __seg_last, __f);
+  };
+  return std::__for_each_n_segment(__first, __orig_n, __visit_segment);
+}
+#endif // !_LIBCPP_CXX03_LANG
+
+#if _LIBCPP_STD_VER >= 17
+// std::for_each_n (C++17 and later): forwards to the std::__for_each_n overload selected for _InputIterator.
+template <class _InputIterator, class _Size, class _Function>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+  return std::__for_each_n(__first, __orig_n, __f); // __f goes in as an lvalue: the helpers take _Func&
+}
+
+#endif // _LIBCPP_STD_VER >= 17
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_FOR_EACH_N_H
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
new file mode 100644
index 0000000000000..1b522fb373eee
--- /dev/null
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+#define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+
+#include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// __for_each_n_segment optimizes linear iteration over segmented iterators. It walks the input range
+// [__first, __first + __n) one segment at a time and calls __func(__lfirst, __llast) once for the local
+// sub-range of every segment it touches. The return value of __func is ignored, and the function returns
+// an iterator pointing to one past the last processed element; if __n is not positive it returns __first.
+
+template <class _SegmentedIterator, class _Size, class _Functor>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
+__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
+  static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
+                    __has_random_access_iterator_category<
+                        typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
+                "__for_each_n_segment only works with segmented iterators with random-access local iterators");
+  using _Traits        = __segmented_iterator_traits<_SegmentedIterator>;
+  using __local_iter_t = typename _Traits::__local_iterator;
+  using __difference_t = typename std::iterator_traits<__local_iter_t>::difference_type;
+
+  // Test the converted count, not __orig_n: a positive count that converts to zero (e.g. 0.5) would
+  // otherwise skip the loop and build the result from a __local_last that was never assigned.
+  __difference_t __n = __orig_n;
+  if (__n <= 0)
+    return __first;
+  auto __seg         = _Traits::__segment(__first);
+  auto __local_first = _Traits::__local(__first);
+  __local_iter_t __local_last;
+  while (__n > 0) {
+    __local_last    = _Traits::__end(__seg);
+    auto __seg_size = __local_last - __local_first;
+    if (__n <= __seg_size) { // The count ends in this segment: visit only the next __n elements.
+      __local_last = __local_first + __n;
+      __func(__local_first, __local_last);
+      break;
+    }
+    __func(__local_first, __local_last); // Consume the rest of this segment, then move to the next one.
+    __n -= __seg_size;
+    __local_first = _Traits::__begin(++__seg);
+  }
+  return _Traits::__compose(__seg, __local_last);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index af928a63f2315..8e8b7a6f400d9 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -437,6 +437,7 @@ module std [system] {
     module find_segment_if                        { header "__algorithm/find_segment_if.h" }
     module find                                   { header "__algorithm/find.h" }
     module for_each_n                             { header "__algorithm/for_each_n.h" }
+    module for_each_n_segment                     { header "__algorithm/for_each_n_segment.h" }
     module for_each_segment                       { header "__algorithm/for_each_segment.h" }
     module for_each                               { header "__algorithm/for_each.h" }
     module generate_n                             { header "__algorithm/generate_n.h" }
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
new file mode 100644
index 0000000000000..784708c7e01eb
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -0,0 +1,98 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <list>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+int main(int argc, char** argv) {
+  // Thin generic wrapper so the algorithm under test can be handed to the registration helpers.
+  auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
+  // Sequence containers: each pass clamps every element of an n-element container in place.
+  {
+    auto bench = []<class Container>(std::string name, auto for_each_n) {
+      using Elem = typename Container::value_type;
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each_n](auto& state) {
+            const std::size_t count = state.range(0);
+            Container data(count, 1);
+            auto begin = data.begin();
+            // Timed region: one full pass over the container per iteration.
+            for ([[maybe_unused]] auto _ : state) {
+              benchmark::DoNotOptimize(data);
+              auto end = for_each_n(begin, count, [](Elem& e) { e = std::clamp<Elem>(e, 10, 100); });
+              benchmark::DoNotOptimize(end);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1 << 10)
+          ->Arg(1 << 12)
+          ->Arg(1 << 13)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bench.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
+    bench.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
+    bench.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
+  }
+
+  // join_view: the same workload over a container of containers flattened with std::views::join.
+  {
+    auto bench = []<class Container>(std::string name, auto for_each_n) {
+      using Inner = typename Container::value_type;
+      using Elem  = typename Inner::value_type;
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each_n](auto& state) {
+            const std::size_t total         = state.range(0);
+            const std::size_t segment_size  = 256;
+            const std::size_t segment_count = (total + segment_size - 1) / segment_size;
+            Container data(segment_count);
+            for (std::size_t i = 0; i != segment_count; ++i) {
+              data[i].resize(std::min(segment_size, total - i * segment_size), Elem(1));
+            }
+            // Every inner container is full except possibly the last, which holds the remainder.
+            auto joined = data | std::views::join;
+            auto begin  = joined.begin();
+            // Timed region: one full pass over the joined view per iteration.
+            for ([[maybe_unused]] auto _ : state) {
+              benchmark::DoNotOptimize(data);
+              auto end = for_each_n(begin, total, [](Elem& e) { e = std::clamp<Elem>(e, 10, 100); });
+              benchmark::DoNotOptimize(end);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1 << 10)
+          ->Arg(1 << 12)
+          ->Arg(1 << 13)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bench.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+  }
+  // Parse the benchmark command-line flags, run everything registered above, then shut down.
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index 371f6c92f1ed1..39c1174dcec8b 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -13,69 +13,118 @@
 //    constexpr InputIterator      // constexpr after C++17
 //    for_each_n(InputIterator first, Size n, Function f);
 
-
 #include <algorithm>
 #include <cassert>
+#include <deque>
 #include <functional>
+#include <iterator>
+#include <ranges>
+#include <vector>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
-    int ia[] = {1, 3, 6, 7};
-    int expected[] = {3, 5, 8, 9};
-    const std::size_t N = 4;
+struct for_each_test { // functor: increments the element it is given and tallies its invocations
+  int count;
+  TEST_CONSTEXPR for_each_test(int start) : count(start) {}
+  TEST_CONSTEXPR_CXX14 void operator()(int& elem) {
+    elem += 1;
+    count += 1;
+  }
+};
 
-    auto it = std::for_each_n(std::begin(ia), N, [](int &a) { a += 2; });
-    return it == (std::begin(ia) + N)
-        && std::equal(std::begin(ia), std::end(ia), std::begin(expected))
-        ;
-    }
-#endif
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
 
-struct for_each_test
-{
-    for_each_test(int c) : count(c) {}
-    int count;
-    void operator()(int& i) {++i; ++count;}
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
 };
 
-int main(int, char**)
-{
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+  // Check that segmented deque iterators work properly: deque_test asserts the elements arrive in order.
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+    auto it   = std::for_each_n(d.begin(), d.size(), deque_test(d, index));
+    assert(it == d.end() && index == size); // the whole deque was visited and the end iterator returned
+  }
+}
+
+TEST_CONSTEXPR_CXX20 bool test() {
+  {
     typedef cpp17_input_iterator<int*> Iter;
-    int ia[] = {0, 1, 2, 3, 4, 5};
-    const unsigned s = sizeof(ia)/sizeof(ia[0]);
+    int ia[]         = {0, 1, 2, 3, 4, 5};
+    const unsigned s = sizeof(ia) / sizeof(ia[0]);
 
     {
-    auto f = for_each_test(0);
-    Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
-    assert(it == Iter(ia));
-    assert(f.count == 0);
+      unsigned count = 0;
+      Iter it        = std::for_each_n(Iter(ia), 0, [&count](int& i) {
+        ++i;
+        ++count;
+      });
+      assert(it == Iter(ia));
+      assert(count == 0);
     }
 
     {
-    auto f = for_each_test(0);
-    Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
-
-    assert(it == Iter(ia+s));
-    assert(f.count == s);
-    for (unsigned i = 0; i < s; ++i)
-        assert(ia[i] == static_cast<int>(i+1));
+      unsigned count = 0;
+      Iter it        = std::for_each_n(Iter(ia), s, [&count](int& i) {
+        ++i;
+        ++count;
+      });
+      assert(it == Iter(ia + s));
+      assert(count == s);
+      for (unsigned i = 0; i < s; ++i)
+        assert(ia[i] == static_cast<int>(i + 1));
     }
 
     {
-    auto f = for_each_test(0);
-    Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
-
-    assert(it == Iter(ia+1));
-    assert(f.count == 1);
-    for (unsigned i = 0; i < 1; ++i)
-        assert(ia[i] == static_cast<int>(i+2));
+      unsigned count = 0;
+      Iter it        = std::for_each_n(Iter(ia), 1, [&count](int& i) {
+        ++i;
+        ++count;
+      });
+      assert(it == Iter(ia + 1));
+      assert(count == 1);
+      for (unsigned i = 0; i < 1; ++i)
+        assert(ia[i] == static_cast<int>(i + 2));
     }
+  }
+
+  {
+    int ia[]            = {1, 3, 6, 7};
+    int expected[]      = {3, 5, 8, 9};
+    const std::size_t N = 4;
+
+    auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
+    assert(it == (std::begin(ia) + N) && std::equal(std::begin(ia), std::end(ia), std::begin(expected)));
+  }
+
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+#if TEST_STD_VER >= 20
+  {
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
+  }
+#endif
+
+  return true;
+}
 
+int main(int, char**) {
+  assert(test());
 #if TEST_STD_VER > 17
-    static_assert(test_constexpr());
+  static_assert(test());
 #endif
 
   return 0;



More information about the libcxx-commits mailing list