[libcxx-commits] [libcxx] [libc++] Optimize ranges::{for_each, for_each_n} for segmented iterators (PR #132896)

Peng Liu via libcxx-commits libcxx-commits at lists.llvm.org
Fri Apr 4 18:21:33 PDT 2025


https://github.com/winner245 updated https://github.com/llvm/llvm-project/pull/132896

From 8133b63fb2a47a1ecfe53128169e7f3310b55cd7 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Tue, 25 Mar 2025 03:44:59 -0400
Subject: [PATCH 01/14] Optimize ranges::{for_each, for_each_n} for segmented
 iterators

---
 libcxx/include/__algorithm/for_each_n.h       |  25 +++-
 libcxx/include/__algorithm/ranges_for_each.h  |  14 +-
 .../include/__algorithm/ranges_for_each_n.h   |  15 ++-
 .../nonmodifying/for_each_n.bench.cpp         |  57 +++++++++
 .../alg.foreach/for_each_n.pass.cpp           | 120 ++++++++++++------
 .../alg.foreach/ranges.for_each.pass.cpp      |  46 ++++++-
 .../alg.foreach/ranges.for_each_n.pass.cpp    |  46 ++++++-
 7 files changed, 270 insertions(+), 53 deletions(-)
 create mode 100644 libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp

diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index fce380b49df3e..3d91124432f56 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -10,7 +10,11 @@
 #ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H
 #define _LIBCPP___ALGORITHM_FOR_EACH_N_H
 
+#include <__algorithm/for_each.h>
 #include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+#include <__type_traits/enable_if.h>
 #include <__utility/convert_to_integral.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -21,7 +25,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 17
 
-template <class _InputIterator, class _Size, class _Function>
+template <class _InputIterator,
+          class _Size,
+          class _Function,
+          __enable_if_t<!__is_segmented_iterator<_InputIterator>::value ||
+                            (__has_input_iterator_category<_InputIterator>::value &&
+                             !__has_random_access_iterator_category<_InputIterator>::value),
+                        int> = 0>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
@@ -34,6 +44,19 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
   return __first;
 }
 
+template <class _InputIterator,
+          class _Size,
+          class _Function,
+          __enable_if_t<__is_segmented_iterator<_InputIterator>::value &&
+                            __has_random_access_iterator_category<_InputIterator>::value,
+                        int> = 0>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+  _InputIterator __last = __first + __orig_n;
+  std::for_each(__first, __last, __f);
+  return __last;
+}
+
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index de39bc5522753..475f85366188e 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 
+#include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
@@ -41,9 +42,16 @@ struct __for_each {
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    for (; __first != __last; ++__first)
-      std::invoke(__func, std::invoke(__proj, *__first));
-    return {std::move(__first), std::move(__func)};
+    if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
+      auto __n   = __last - __first;
+      auto __end = __first + __n;
+      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      return {std::move(__end), std::move(__func)};
+    } else {
+      for (; __first != __last; ++__first)
+        std::invoke(__func, std::invoke(__proj, *__first));
+      return {std::move(__first), std::move(__func)};
+    }
   }
 
 public:
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 603cb723233c8..3108d66001295 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 
+#include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
@@ -40,11 +41,17 @@ struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
-    while (__count-- > 0) {
-      std::invoke(__func, std::invoke(__proj, *__first));
-      ++__first;
+    if constexpr (random_access_iterator<_Iter>) {
+      auto __last = __first + __count;
+      std::for_each(__first, __last, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      return {std::move(__last), std::move(__func)};
+    } else {
+      while (__count-- > 0) {
+        std::invoke(__func, std::invoke(__proj, *__first));
+        ++__first;
+      }
+      return {std::move(__first), std::move(__func)};
     }
-    return {std::move(__first), std::move(__func)};
   }
 };
 
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
new file mode 100644
index 0000000000000..af46371881577
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <list>
+#include <string>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+int main(int argc, char** argv) {
+  auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
+
+  // {std,ranges}::for_each_n
+  {
+    auto bm = []<class Container>(std::string name, auto for_each_n) {
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each_n](auto& st) {
+            std::size_t const n = st.range(0);
+            Container c(n, 1);
+            auto first = c.begin();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each_n(first, n, [](int& x) { x = std::clamp(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(8192)
+          ->Arg(1 << 20);
+    };
+    bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
+    bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
+    bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
+    bm.operator()<std::vector<int>>("rng::for_each_n(vector<int>)", std::ranges::for_each_n);
+    bm.operator()<std::deque<int>>("rng::for_each_n(deque<int>)", std::ranges::for_each_n);
+    bm.operator()<std::list<int>>("rng::for_each_n(list<int>)", std::ranges::for_each_n);
+  }
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index 371f6c92f1ed1..42f1a41a27096 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -13,69 +13,113 @@
 //    constexpr InputIterator      // constexpr after C++17
 //    for_each_n(InputIterator first, Size n, Function f);
 
-
 #include <algorithm>
 #include <cassert>
+#include <deque>
 #include <functional>
+#include <iterator>
+#include <ranges>
+#include <vector>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
-    int ia[] = {1, 3, 6, 7};
-    int expected[] = {3, 5, 8, 9};
-    const std::size_t N = 4;
+struct for_each_test {
+  TEST_CONSTEXPR for_each_test(int c) : count(c) {}
+  int count;
+  TEST_CONSTEXPR_CXX14 void operator()(int& i) {
+    ++i;
+    ++count;
+  }
+};
 
-    auto it = std::for_each_n(std::begin(ia), N, [](int &a) { a += 2; });
-    return it == (std::begin(ia) + N)
-        && std::equal(std::begin(ia), std::end(ia), std::begin(expected))
-        ;
-    }
-#endif
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
 
-struct for_each_test
-{
-    for_each_test(int c) : count(c) {}
-    int count;
-    void operator()(int& i) {++i; ++count;}
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
 };
 
-int main(int, char**)
-{
+/*TEST_CONSTEXPR_CXX23*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+  // check that segmented iterators work properly
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+
+    std::for_each_n(d.begin(), d.size(), deque_test(d, index));
+  }
+}
+
+TEST_CONSTEXPR_CXX20 bool test() {
+  {
     typedef cpp17_input_iterator<int*> Iter;
-    int ia[] = {0, 1, 2, 3, 4, 5};
-    const unsigned s = sizeof(ia)/sizeof(ia[0]);
+    int ia[]         = {0, 1, 2, 3, 4, 5};
+    const unsigned s = sizeof(ia) / sizeof(ia[0]);
 
     {
-    auto f = for_each_test(0);
-    Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
-    assert(it == Iter(ia));
-    assert(f.count == 0);
+      auto f  = for_each_test(0);
+      Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
+      assert(it == Iter(ia));
+      assert(f.count == 0);
     }
 
     {
-    auto f = for_each_test(0);
-    Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
+      auto f  = for_each_test(0);
+      Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
 
-    assert(it == Iter(ia+s));
-    assert(f.count == s);
-    for (unsigned i = 0; i < s; ++i)
-        assert(ia[i] == static_cast<int>(i+1));
+      assert(it == Iter(ia + s));
+      assert(f.count == s);
+      for (unsigned i = 0; i < s; ++i)
+        assert(ia[i] == static_cast<int>(i + 1));
     }
 
     {
-    auto f = for_each_test(0);
-    Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
+      auto f  = for_each_test(0);
+      Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
 
-    assert(it == Iter(ia+1));
-    assert(f.count == 1);
-    for (unsigned i = 0; i < 1; ++i)
-        assert(ia[i] == static_cast<int>(i+2));
+      assert(it == Iter(ia + 1));
+      assert(f.count == 1);
+      for (unsigned i = 0; i < 1; ++i)
+        assert(ia[i] == static_cast<int>(i + 2));
     }
+  }
+
+#if TEST_STD_VER > 11
+  {
+    int ia[]            = {1, 3, 6, 7};
+    int expected[]      = {3, 5, 8, 9};
+    const std::size_t N = 4;
+
+    auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
+    assert(it == (std::begin(ia) + N) && std::equal(std::begin(ia), std::end(ia), std::begin(expected)));
+  }
+#endif
+
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+#if TEST_STD_VER >= 20
+  { // Make sure that the segmented iterator optimization works during constant evaluation
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
+  }
+#endif
+
+  return true;
+}
 
+int main(int, char**) {
+  assert(test());
 #if TEST_STD_VER > 17
-    static_assert(test_constexpr());
+  static_assert(test());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
index 8b9b6e82cbcb2..2f4bfb9db6dba 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
@@ -20,7 +20,10 @@
 
 #include <algorithm>
 #include <array>
+#include <cassert>
+#include <deque>
 #include <ranges>
+#include <vector>
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
@@ -30,7 +33,7 @@ struct Callable {
 };
 
 template <class Iter, class Sent = Iter>
-concept HasForEachIt = requires (Iter iter, Sent sent) { std::ranges::for_each(iter, sent, Callable{}); };
+concept HasForEachIt = requires(Iter iter, Sent sent) { std::ranges::for_each(iter, sent, Callable{}); };
 
 static_assert(HasForEachIt<int*>);
 static_assert(!HasForEachIt<InputIteratorNotDerivedFrom>);
@@ -47,7 +50,7 @@ static_assert(!HasForEachItFunc<IndirectUnaryPredicateNotPredicate>);
 static_assert(!HasForEachItFunc<IndirectUnaryPredicateNotCopyConstructible>);
 
 template <class Range>
-concept HasForEachR = requires (Range range) { std::ranges::for_each(range, Callable{}); };
+concept HasForEachR = requires(Range range) { std::ranges::for_each(range, Callable{}); };
 
 static_assert(HasForEachR<UncheckedRange<int*>>);
 static_assert(!HasForEachR<InputRangeNotDerivedFrom>);
@@ -68,7 +71,7 @@ constexpr void test_iterator() {
   { // simple test
     {
       auto func = [i = 0](int& a) mutable { a += i++; };
-      int a[] = {1, 6, 3, 4};
+      int a[]   = {1, 6, 3, 4};
       std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> decltype(auto) ret =
           std::ranges::for_each(Iter(a), Sent(Iter(a + 4)), func);
       assert(a[0] == 1);
@@ -81,8 +84,8 @@ constexpr void test_iterator() {
       assert(i == 4);
     }
     {
-      auto func = [i = 0](int& a) mutable { a += i++; };
-      int a[] = {1, 6, 3, 4};
+      auto func  = [i = 0](int& a) mutable { a += i++; };
+      int a[]    = {1, 6, 3, 4};
       auto range = std::ranges::subrange(Iter(a), Sent(Iter(a + 4)));
       std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> decltype(auto) ret =
           std::ranges::for_each(range, func);
@@ -110,6 +113,30 @@ constexpr void test_iterator() {
   }
 }
 
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
+
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
+};
+
+/*TEST_CONSTEXPR_CXX23*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+  // check that segmented iterators work properly
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+
+    std::ranges::for_each(d, deque_test(d, index));
+  }
+}
+
 constexpr bool test() {
   test_iterator<cpp17_input_iterator<int*>, sentinel_wrapper<cpp17_input_iterator<int*>>>();
   test_iterator<cpp20_input_iterator<int*>, sentinel_wrapper<cpp20_input_iterator<int*>>>();
@@ -146,6 +173,15 @@ constexpr bool test() {
     }
   }
 
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+  {
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::ranges::for_each(v, [i = 0](int x) mutable { assert(x == 2 * i++); }, [](int x) { return 2 * x; });
+  }
+
   return true;
 }
 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
index d4b2d053d08ce..ad1447b7348f5 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
@@ -17,7 +17,12 @@
 
 #include <algorithm>
 #include <array>
+#include <cassert>
+#include <deque>
+#include <iterator>
 #include <ranges>
+#include <ranges>
+#include <vector>
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
@@ -27,7 +32,7 @@ struct Callable {
 };
 
 template <class Iter>
-concept HasForEachN = requires (Iter iter) { std::ranges::for_each_n(iter, 0, Callable{}); };
+concept HasForEachN = requires(Iter iter) { std::ranges::for_each_n(iter, 0, Callable{}); };
 
 static_assert(HasForEachN<int*>);
 static_assert(!HasForEachN<InputIteratorNotDerivedFrom>);
@@ -45,7 +50,7 @@ template <class Iter>
 constexpr void test_iterator() {
   { // simple test
     auto func = [i = 0](int& a) mutable { a += i++; };
-    int a[] = {1, 6, 3, 4};
+    int a[]   = {1, 6, 3, 4};
     std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> auto ret =
         std::ranges::for_each_n(Iter(a), 4, func);
     assert(a[0] == 1);
@@ -64,6 +69,30 @@ constexpr void test_iterator() {
   }
 }
 
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
+
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
+};
+
+/*TEST_CONSTEXPR_CXX23*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+  // check that segmented iterators work properly
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+
+    std::ranges::for_each_n(d.begin(), d.size(), deque_test(d, index));
+  }
+}
+
 constexpr bool test() {
   test_iterator<cpp17_input_iterator<int*>>();
   test_iterator<cpp20_input_iterator<int*>>();
@@ -89,6 +118,19 @@ constexpr bool test() {
     assert(a[2].other == 6);
   }
 
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+  {
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::ranges::for_each_n(
+        v.begin(),
+        std::ranges::distance(v),
+        [i = 0](int x) mutable { assert(x == 2 * i++); },
+        [](int x) { return 2 * x; });
+  }
+
   return true;
 }
 

From 6bec774bbb39c2896623a1ada3c4304c1bb3980d Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Tue, 25 Mar 2025 21:29:27 -0400
Subject: [PATCH 02/14] Address ldionne's review comments

---
 libcxx/include/__algorithm/for_each.h           | 17 ++++++++++++-----
 libcxx/include/__algorithm/for_each_n.h         |  3 ++-
 libcxx/include/__algorithm/ranges_for_each.h    |  4 +++-
 libcxx/include/__algorithm/ranges_for_each_n.h  |  4 +++-
 .../alg.foreach/for_each_n.pass.cpp             |  2 +-
 .../alg.foreach/ranges.for_each.pass.cpp        |  2 +-
 .../alg.foreach/ranges.for_each_n.pass.cpp      |  2 +-
 7 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index e08f583504c01..0a03702f982be 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_H
 
 #include <__algorithm/for_each_segment.h>
+#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__ranges/movable_box.h>
@@ -26,20 +27,20 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIterator, class _Function>
+template <class, class _InputIterator, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
-for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
+__for_each(_InputIterator __first, _InputIterator __last, _Function& __f) {
   for (; __first != __last; ++__first)
     __f(*__first);
-  return __f;
+  return std::move(__f);
 }
 
 // __movable_box is available in C++20, but is actually a copyable-box, so optimization is only correct in C++23
 #if _LIBCPP_STD_VER >= 23
-template <class _SegmentedIterator, class _Function>
+template <class, class _SegmentedIterator, class _Function>
   requires __is_segmented_iterator<_SegmentedIterator>::value
 _LIBCPP_HIDE_FROM_ABI constexpr _Function
-for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function __func) {
+for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
   ranges::__movable_box<_Function> __wrapped_func(in_place, std::move(__func));
   std::__for_each_segment(__first, __last, [&](auto __lfirst, auto __llast) {
     __wrapped_func =
@@ -49,6 +50,12 @@ for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function __func
 }
 #endif // _LIBCPP_STD_VER >= 23
 
+template <class _InputIterator, class _Function>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
+for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
+  return __for_each<_ClassicAlgPolicy>(__first, __last, __f);
+}
+
 _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 3d91124432f56..20e8bcc4b8c76 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_N_H
 
 #include <__algorithm/for_each.h>
+#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/segmented_iterator.h>
@@ -53,7 +54,7 @@ template <class _InputIterator,
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
   _InputIterator __last = __first + __orig_n;
-  std::for_each(__first, __last, __f);
+  std::__for_each<_ClassicAlgPolicy>(__first, __last, __f);
   return __last;
 }
 
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 475f85366188e..5d27befd9619f 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
+#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -45,7 +46,8 @@ struct __for_each {
     if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
       auto __n   = __last - __first;
       auto __end = __first + __n;
-      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      auto __f   = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
+      std::__for_each<_RangeAlgPolicy>(__first, __end, __f);
       return {std::move(__end), std::move(__func)};
     } else {
       for (; __first != __last; ++__first)
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 3108d66001295..8384ba3bb14e6 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
+#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -43,7 +44,8 @@ struct __for_each_n {
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
     if constexpr (random_access_iterator<_Iter>) {
       auto __last = __first + __count;
-      std::for_each(__first, __last, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
+      std::__for_each<_RangeAlgPolicy>(__first, __last, __f);
       return {std::move(__last), std::move(__func)};
     } else {
       while (__count-- > 0) {
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index 42f1a41a27096..7819c785fc3eb 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -47,7 +47,7 @@ struct deque_test {
 
 /*TEST_CONSTEXPR_CXX23*/
 void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
-  // check that segmented iterators work properly
+  // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
     std::deque<int> d(size);
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
index 2f4bfb9db6dba..14be4a42f667c 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
@@ -127,7 +127,7 @@ struct deque_test {
 
 /*TEST_CONSTEXPR_CXX23*/
 void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
-  // check that segmented iterators work properly
+  // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
     std::deque<int> d(size);
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
index ad1447b7348f5..ac073d3052170 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
@@ -83,7 +83,7 @@ struct deque_test {
 
 /*TEST_CONSTEXPR_CXX23*/
 void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
-  // check that segmented iterators work properly
+  // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
     std::deque<int> d(size);

From aaa24967889a831d62dee0008270588deda6f344 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Tue, 25 Mar 2025 23:11:34 -0400
Subject: [PATCH 03/14] Fix test and ADL call

---
 libcxx/include/__algorithm/for_each.h                       | 6 +++++-
 .../alg.nonmodifying/alg.foreach/for_each_n.pass.cpp        | 6 +++---
 .../alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp   | 6 +++---
 .../alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp | 6 +++---
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index 0a03702f982be..07fa58df55c28 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -27,6 +27,10 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _InputIterator, class _Function>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
+for_each(_InputIterator __first, _InputIterator __last, _Function __f);
+
 template <class, class _InputIterator, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
 __for_each(_InputIterator __first, _InputIterator __last, _Function& __f) {
@@ -53,7 +57,7 @@ for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __fun
 template <class _InputIterator, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
 for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
-  return __for_each<_ClassicAlgPolicy>(__first, __last, __f);
+  return std::__for_each<_ClassicAlgPolicy>(__first, __last, __f);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index 7819c785fc3eb..a2b8931a62985 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -45,8 +45,8 @@ struct deque_test {
   }
 };
 
-/*TEST_CONSTEXPR_CXX23*/
-void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
   // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
@@ -102,7 +102,7 @@ TEST_CONSTEXPR_CXX20 bool test() {
   }
 #endif
 
-  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
     test_segmented_deque_iterator();
 
 #if TEST_STD_VER >= 20
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
index 14be4a42f667c..a6d0afde3186a 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
@@ -125,8 +125,8 @@ struct deque_test {
   }
 };
 
-/*TEST_CONSTEXPR_CXX23*/
-void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
   // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
@@ -173,7 +173,7 @@ constexpr bool test() {
     }
   }
 
-  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
     test_segmented_deque_iterator();
 
   {
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
index ac073d3052170..1578763694231 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
@@ -81,8 +81,8 @@ struct deque_test {
   }
 };
 
-/*TEST_CONSTEXPR_CXX23*/
-void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
   // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
@@ -118,7 +118,7 @@ constexpr bool test() {
     assert(a[2].other == 6);
   }
 
-  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
     test_segmented_deque_iterator();
 
   {

>From 964df6be45c17dc000dcb921059eb61da0515d5e Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Wed, 26 Mar 2025 11:10:37 -0400
Subject: [PATCH 04/14] Make for_each segmented iterator optimization valid for
 C++03

---
 libcxx/include/__algorithm/for_each.h         | 43 +++++++++++--------
 libcxx/include/__algorithm/for_each_n.h       | 10 +++--
 .../include/__algorithm/ranges_for_each_n.h   |  5 ++-
 3 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index 07fa58df55c28..1260d5204bcf1 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -14,7 +14,7 @@
 #include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/segmented_iterator.h>
-#include <__ranges/movable_box.h>
+#include <__type_traits/enable_if.h>
 #include <__utility/in_place.h>
 #include <__utility/move.h>
 
@@ -27,32 +27,37 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIterator, class _Function>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
-for_each(_InputIterator __first, _InputIterator __last, _Function __f);
-
-template <class, class _InputIterator, class _Function>
+template <class, class _InputIterator, class _Sent, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
-__for_each(_InputIterator __first, _InputIterator __last, _Function& __f) {
+__for_each(_InputIterator __first, _Sent __last, _Function& __f) {
   for (; __first != __last; ++__first)
     __f(*__first);
   return std::move(__f);
 }
 
-// __movable_box is available in C++20, but is actually a copyable-box, so optimization is only correct in C++23
-#if _LIBCPP_STD_VER >= 23
-template <class, class _SegmentedIterator, class _Function>
-  requires __is_segmented_iterator<_SegmentedIterator>::value
+template <class _InputIterator, class _Function>
+struct _ForeachSegment {
+  using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InputIterator>;
+
+  _Function& __func_;
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit _ForeachSegment(_Function& __func) : __func_(__func) {}
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
+  operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
+    std::__for_each<_ClassicAlgPolicy>(__lfirst, __llast, __func_);
+  }
+};
+
+template <class,
+          class _SegmentedIterator,
+          class _Function,
+          __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI constexpr _Function
-for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
-  ranges::__movable_box<_Function> __wrapped_func(in_place, std::move(__func));
-  std::__for_each_segment(__first, __last, [&](auto __lfirst, auto __llast) {
-    __wrapped_func =
-        ranges::__movable_box<_Function>(in_place, std::for_each(__lfirst, __llast, std::move(*__wrapped_func)));
-  });
-  return std::move(*__wrapped_func);
+__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
+  std::__for_each_segment(__first, __last, _ForeachSegment<_SegmentedIterator, _Function>(__func));
+  return std::move(__func);
 }
-#endif // _LIBCPP_STD_VER >= 23
 
 template <class _InputIterator, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 20e8bcc4b8c76..8cd136521743c 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -14,6 +14,7 @@
 #include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
+#include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
 #include <__utility/convert_to_integral.h>
@@ -30,8 +31,7 @@ template <class _InputIterator,
           class _Size,
           class _Function,
           __enable_if_t<!__is_segmented_iterator<_InputIterator>::value ||
-                            (__has_input_iterator_category<_InputIterator>::value &&
-                             !__has_random_access_iterator_category<_InputIterator>::value),
+                            __has_exactly_input_iterator_category<_InputIterator>::value,
                         int> = 0>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
@@ -49,11 +49,13 @@ template <class _InputIterator,
           class _Size,
           class _Function,
           __enable_if_t<__is_segmented_iterator<_InputIterator>::value &&
-                            __has_random_access_iterator_category<_InputIterator>::value,
+                            __has_forward_iterator_category<_InputIterator>::value,
                         int> = 0>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
-  _InputIterator __last = __first + __orig_n;
+  typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
+  _IntegralSize __n     = __orig_n;
+  _InputIterator __last = std::next(__first, __n);
   std::__for_each<_ClassicAlgPolicy>(__first, __last, __f);
   return __last;
 }
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 8384ba3bb14e6..a5c81868c2062 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -18,6 +18,7 @@
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>
+#include <__iterator/next.h>
 #include <__iterator/projected.h>
 #include <__ranges/concepts.h>
 #include <__utility/move.h>
@@ -42,8 +43,8 @@ struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
-    if constexpr (random_access_iterator<_Iter>) {
-      auto __last = __first + __count;
+    if constexpr (forward_iterator<_Iter>) {
+      auto __last = std::ranges::next(__first, __count);
       auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
       std::__for_each<_RangeAlgPolicy>(__first, __last, __f);
       return {std::move(__last), std::move(__func)};

>From 383856234cca240f940669898ebee50c657af723 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Wed, 26 Mar 2025 11:25:44 -0400
Subject: [PATCH 05/14] Use _LIBCPP_CONSTEXPR_SINCE_CXX14 in place of constexpr

---
 libcxx/include/__algorithm/for_each.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index 1260d5204bcf1..7e514710122b8 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -15,7 +15,6 @@
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
-#include <__utility/in_place.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -53,7 +52,7 @@ template <class,
           class _SegmentedIterator,
           class _Function,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI constexpr _Function
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Function
 __for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
   std::__for_each_segment(__first, __last, _ForeachSegment<_SegmentedIterator, _Function>(__func));
   return std::move(__func);

>From 73d961cb628b17d76877e1a45378c9103f3bd1ac Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Thu, 27 Mar 2025 11:50:12 -0400
Subject: [PATCH 06/14] Allow transitive include of <optional> in affected
 headers

---
 libcxx/include/algorithm             | 1 +
 libcxx/include/array                 | 1 +
 libcxx/include/bitset                | 1 +
 libcxx/include/codecvt               | 1 +
 libcxx/include/condition_variable    | 1 +
 libcxx/include/experimental/iterator | 1 +
 libcxx/include/ios                   | 1 +
 libcxx/include/locale                | 1 +
 libcxx/include/mutex                 | 1 +
 libcxx/include/shared_mutex          | 1 +
 libcxx/include/streambuf             | 1 +
 libcxx/include/string                | 1 +
 libcxx/include/string_view           | 1 +
 libcxx/include/system_error          | 1 +
 libcxx/include/vector                | 1 +
 15 files changed, 15 insertions(+)

diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm
index 6ba903ad3ce1e..1ef709082324b 100644
--- a/libcxx/include/algorithm
+++ b/libcxx/include/algorithm
@@ -2057,6 +2057,7 @@ template <class BidirectionalIterator, class Compare>
 #    include <cstring>
 #    include <iterator>
 #    include <memory>
+#    include <optional>
 #    include <stdexcept>
 #    include <type_traits>
 #    include <utility>
diff --git a/libcxx/include/array b/libcxx/include/array
index d536575d41680..099b85b5d222d 100644
--- a/libcxx/include/array
+++ b/libcxx/include/array
@@ -566,6 +566,7 @@ _LIBCPP_POP_MACROS
 #    include <cstdlib>
 #    include <iterator>
 #    include <new>
+#    include <optional>
 #    include <type_traits>
 #    include <utility>
 #  endif
diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 9106080ec1020..bea5c826ac3b5 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -974,6 +974,7 @@ _LIBCPP_POP_MACROS
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #    include <concepts>
 #    include <cstdlib>
+#    include <optional>
 #    include <type_traits>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/codecvt b/libcxx/include/codecvt
index 0526b8512175f..398a777af1333 100644
--- a/libcxx/include/codecvt
+++ b/libcxx/include/codecvt
@@ -597,6 +597,7 @@ _LIBCPP_END_NAMESPACE_STD
 #    include <limits>
 #    include <mutex>
 #    include <new>
+#    include <optional>
 #    include <stdexcept>
 #    include <type_traits>
 #    include <typeinfo>
diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable
index 81699bf6adbf7..a80627cbf7dec 100644
--- a/libcxx/include/condition_variable
+++ b/libcxx/include/condition_variable
@@ -363,6 +363,7 @@ _LIBCPP_POP_MACROS
 #    include <initializer_list>
 #    include <iosfwd>
 #    include <new>
+#    include <optional>
 #    include <stdexcept>
 #    include <system_error>
 #    include <type_traits>
diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator
index 8448654e8d94c..47ad7459e0150 100644
--- a/libcxx/include/experimental/iterator
+++ b/libcxx/include/experimental/iterator
@@ -127,6 +127,7 @@ _LIBCPP_POP_MACROS
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #    include <cstddef>
 #    include <iosfwd>
+#    include <optional>
 #    include <type_traits>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/ios b/libcxx/include/ios
index 98a088266539a..62e94a28d709e 100644
--- a/libcxx/include/ios
+++ b/libcxx/include/ios
@@ -887,6 +887,7 @@ _LIBCPP_POP_MACROS
 #    include <limits>
 #    include <mutex>
 #    include <new>
+#    include <optional>
 #    include <stdexcept>
 #    include <system_error>
 #    include <type_traits>
diff --git a/libcxx/include/locale b/libcxx/include/locale
index fa15302223202..2a233ce7c159e 100644
--- a/libcxx/include/locale
+++ b/libcxx/include/locale
@@ -3689,6 +3689,7 @@ _LIBCPP_POP_MACROS
 #    include <cstdarg>
 #    include <iterator>
 #    include <mutex>
+#    include <optional>
 #    include <stdexcept>
 #    include <type_traits>
 #    include <typeinfo>
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index 9b128e8710aae..de196b256201a 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -508,6 +508,7 @@ _LIBCPP_POP_MACROS
 #    include <initializer_list>
 #    include <iosfwd>
 #    include <new>
+#    include <optional>
 #    include <stdexcept>
 #    include <system_error>
 #    include <type_traits>
diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex
index b1e2a5d434400..7200c417995a5 100644
--- a/libcxx/include/shared_mutex
+++ b/libcxx/include/shared_mutex
@@ -461,6 +461,7 @@ _LIBCPP_POP_MACROS
 #  endif // _LIBCPP_HAS_THREADS
 
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#    include <optional>
 #    include <system_error>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf
index 3c4e9086e05ec..d688ec76cae3a 100644
--- a/libcxx/include/streambuf
+++ b/libcxx/include/streambuf
@@ -386,6 +386,7 @@ _LIBCPP_POP_MACROS
 
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #    include <cstdint>
+#    include <optional>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
diff --git a/libcxx/include/string b/libcxx/include/string
index fa87dc2fddb59..ef873f175f70c 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -4101,6 +4101,7 @@ _LIBCPP_POP_MACROS
 #    include <cstdlib>
 #    include <iterator>
 #    include <new>
+#    include <optional>
 #    include <type_traits>
 #    include <typeinfo>
 #    include <utility>
diff --git a/libcxx/include/string_view b/libcxx/include/string_view
index c640ae4e79865..b9efaa90ef3e6 100644
--- a/libcxx/include/string_view
+++ b/libcxx/include/string_view
@@ -948,6 +948,7 @@ _LIBCPP_POP_MACROS
 #    include <concepts>
 #    include <cstdlib>
 #    include <iterator>
+#    include <optional>
 #    include <type_traits>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/system_error b/libcxx/include/system_error
index 4dadc0a6ab483..2b668e5f8f1bc 100644
--- a/libcxx/include/system_error
+++ b/libcxx/include/system_error
@@ -168,6 +168,7 @@ template <> struct hash<std::error_condition>;
 #    include <cstdint>
 #    include <cstring>
 #    include <limits>
+#    include <optional>
 #    include <type_traits>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 9fa81dcb7e76e..d2d5fcf4a3199 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -362,6 +362,7 @@ template<class T, class charT> requires is-vector-bool-reference<T> // Since C++
 #    if _LIBCPP_HAS_LOCALIZATION
 #      include <locale>
 #    endif
+#    include <optional>
 #    include <string>
 #    include <string_view>
 #    include <tuple>

>From 01082a9484c5581d0b10a8b3efbb7a14af1924fa Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Thu, 27 Mar 2025 12:00:54 -0400
Subject: [PATCH 07/14] Remove unnecessary _AlgoPolicy template parameter

---
 libcxx/include/__algorithm/for_each.h          | 10 ++++------
 libcxx/include/__algorithm/for_each_n.h        |  3 +--
 libcxx/include/__algorithm/ranges_for_each.h   |  3 +--
 libcxx/include/__algorithm/ranges_for_each_n.h |  3 +--
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index 7e514710122b8..a61c2f1fd2420 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_H
 
 #include <__algorithm/for_each_segment.h>
-#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
@@ -26,7 +25,7 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class, class _InputIterator, class _Sent, class _Function>
+template <class _InputIterator, class _Sent, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
 __for_each(_InputIterator __first, _Sent __last, _Function& __f) {
   for (; __first != __last; ++__first)
@@ -44,12 +43,11 @@ struct _ForeachSegment {
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
   operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
-    std::__for_each<_ClassicAlgPolicy>(__lfirst, __llast, __func_);
+    std::__for_each(__lfirst, __llast, __func_);
   }
 };
 
-template <class,
-          class _SegmentedIterator,
+template <class _SegmentedIterator,
           class _Function,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Function
@@ -61,7 +59,7 @@ __for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __f
 template <class _InputIterator, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
 for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
-  return std::__for_each<_ClassicAlgPolicy>(__first, __last, __f);
+  return std::__for_each(__first, __last, __f);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 8cd136521743c..ec8fe955ed48f 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_N_H
 
 #include <__algorithm/for_each.h>
-#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
@@ -56,7 +55,7 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n     = __orig_n;
   _InputIterator __last = std::next(__first, __n);
-  std::__for_each<_ClassicAlgPolicy>(__first, __last, __f);
+  std::__for_each(__first, __last, __f);
   return __last;
 }
 
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 5d27befd9619f..096e60683e39d 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -11,7 +11,6 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
-#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -47,7 +46,7 @@ struct __for_each {
       auto __n   = __last - __first;
       auto __end = __first + __n;
       auto __f   = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each<_RangeAlgPolicy>(__first, __end, __f);
+      std::__for_each(__first, __end, __f);
       return {std::move(__end), std::move(__func)};
     } else {
       for (; __first != __last; ++__first)
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index a5c81868c2062..9c6c2b97a2ad1 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -11,7 +11,6 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
-#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -46,7 +45,7 @@ struct __for_each_n {
     if constexpr (forward_iterator<_Iter>) {
       auto __last = std::ranges::next(__first, __count);
       auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each<_RangeAlgPolicy>(__first, __last, __f);
+      std::__for_each(__first, __last, __f);
       return {std::move(__last), std::move(__func)};
     } else {
       while (__count-- > 0) {

>From 1d4b04b4473b096ce191517a41e8731907c86d6d Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Thu, 27 Mar 2025 23:40:27 -0400
Subject: [PATCH 08/14] Inline tests and remove std::ref

---
 .../alg.foreach/for_each_n.pass.cpp           | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index a2b8931a62985..4d6e4c615c022 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -64,34 +64,40 @@ TEST_CONSTEXPR_CXX20 bool test() {
     const unsigned s = sizeof(ia) / sizeof(ia[0]);
 
     {
-      auto f  = for_each_test(0);
-      Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
+      unsigned count = 0;
+      Iter it        = std::for_each_n(Iter(ia), 0, [&count](int& i) mutable {
+        ++i;
+        ++count;
+      });
       assert(it == Iter(ia));
-      assert(f.count == 0);
+      assert(count == 0);
     }
 
     {
-      auto f  = for_each_test(0);
-      Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
-
+      unsigned count = 0;
+      Iter it        = std::for_each_n(Iter(ia), s, [&count](int& i) mutable {
+        ++i;
+        ++count;
+      });
       assert(it == Iter(ia + s));
-      assert(f.count == s);
+      assert(count == s);
       for (unsigned i = 0; i < s; ++i)
         assert(ia[i] == static_cast<int>(i + 1));
     }
 
     {
-      auto f  = for_each_test(0);
-      Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
-
+      unsigned count = 0;
+      Iter it        = std::for_each_n(Iter(ia), 1, [&count](int& i) mutable {
+        ++i;
+        ++count;
+      });
       assert(it == Iter(ia + 1));
-      assert(f.count == 1);
+      assert(count == 1);
       for (unsigned i = 0; i < 1; ++i)
         assert(ia[i] == static_cast<int>(i + 2));
     }
   }
 
-#if TEST_STD_VER > 11
   {
     int ia[]            = {1, 3, 6, 7};
     int expected[]      = {3, 5, 8, 9};
@@ -100,7 +106,6 @@ TEST_CONSTEXPR_CXX20 bool test() {
     auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
     assert(it == (std::begin(ia) + N) && std::equal(std::begin(ia), std::end(ia), std::begin(expected)));
   }
-#endif
 
   if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
     test_segmented_deque_iterator();

>From 32512e86391f4938fc0c7a42c69c993c971e7702 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Fri, 28 Mar 2025 20:26:31 -0400
Subject: [PATCH 09/14] Apply optimization for join_view segmented iterators

---
 libcxx/docs/ReleaseNotes/21.rst               |   4 +
 libcxx/include/CMakeLists.txt                 |   1 +
 libcxx/include/__algorithm/for_each.h         |   7 +-
 libcxx/include/__algorithm/for_each_n.h       |  17 +--
 .../include/__algorithm/for_each_n_segment.h  |  77 +++++++++++
 .../include/__algorithm/ranges_for_each_n.h   |   5 +-
 libcxx/include/module.modulemap               |   1 +
 .../nonmodifying/for_each.bench.cpp           |  23 +++-
 .../nonmodifying/for_each_join_view.bench.cpp | 122 ++++++++++++++++++
 .../nonmodifying/for_each_n.bench.cpp         |  23 +++-
 10 files changed, 260 insertions(+), 20 deletions(-)
 create mode 100644 libcxx/include/__algorithm/for_each_n_segment.h
 create mode 100644 libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 7af109ddc8657..39c6e93be9a99 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -60,6 +60,10 @@ Improvements and New Features
 
 - Updated formatting library to Unicode 16.0.0.
 
+- The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
+  resulting in performance improvements of up to 21.2x for ``std::deque::iterator`` segmented inputs and 17.9x for
+  ``join_view`` of ``vector<vector<T>>``.
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index a021b9bb44d67..9803ff8c193c4 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -25,6 +25,7 @@ set(files
   __algorithm/find_segment_if.h
   __algorithm/for_each.h
   __algorithm/for_each_n.h
+  __algorithm/for_each_n_segment.h
   __algorithm/for_each_segment.h
   __algorithm/generate.h
   __algorithm/generate_n.h
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index a61c2f1fd2420..e6cb505e35274 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -33,13 +33,14 @@ __for_each(_InputIterator __first, _Sent __last, _Function& __f) {
   return std::move(__f);
 }
 
+// __do_segment is the functor that __for_each_segment and __for_each_n_segment invoke on each individual segment.
 template <class _InputIterator, class _Function>
-struct _ForeachSegment {
+struct __do_segment {
   using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InputIterator>;
 
   _Function& __func_;
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit _ForeachSegment(_Function& __func) : __func_(__func) {}
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __do_segment(_Function& __func) : __func_(__func) {}
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
   operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
@@ -52,7 +53,7 @@ template <class _SegmentedIterator,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Function
 __for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
-  std::__for_each_segment(__first, __last, _ForeachSegment<_SegmentedIterator, _Function>(__func));
+  std::__for_each_segment(__first, __last, std::__do_segment<_SegmentedIterator, _Function>(__func));
   return std::move(__func);
 }
 
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index ec8fe955ed48f..2abb163766dcb 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_N_H
 
 #include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n_segment.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
@@ -44,19 +45,15 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
   return __first;
 }
 
-template <class _InputIterator,
+template <class _SegmentedIterator,
           class _Size,
           class _Function,
-          __enable_if_t<__is_segmented_iterator<_InputIterator>::value &&
-                            __has_forward_iterator_category<_InputIterator>::value,
+          __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value &&
+                            __has_forward_iterator_category<_SegmentedIterator>::value,
                         int> = 0>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
-  typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
-  _IntegralSize __n     = __orig_n;
-  _InputIterator __last = std::next(__first, __n);
-  std::__for_each(__first, __last, __f);
-  return __last;
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+for_each_n(_SegmentedIterator __first, _Size __orig_n, _Function __f) {
+  return std::__for_each_n_segment(__first, __orig_n, std::__do_segment<_SegmentedIterator, _Function>(__f));
 }
 
 #endif
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
new file mode 100644
index 0000000000000..e2e19cb31ecee
--- /dev/null
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+#define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+
+#include <__config>
+#include <__iterator/distance.h>
+#include <__iterator/next.h>
+#include <__iterator/segmented_iterator.h>
+#include <__utility/convert_to_integral.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// __for_each_n_segment is a utility function for optimizing iterating over segmented iterators linearly.
+// __first and __orig_n represent the beginning and size of a segmented range. __func is expected to
+// take a range of local iterators. Anything that is returned from __func is ignored.
+
+template <class _SegmentedIterator, class _Size, class _Functor>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
+__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
+  if (__orig_n == 0)
+    return __first;
+
+  using _Traits = __segmented_iterator_traits<_SegmentedIterator>;
+  typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
+  _IntegralSize __n = __orig_n;
+  auto __seg        = _Traits::__segment(__first);
+  auto __sfirst     = _Traits::__begin(__seg);
+  auto __slast      = _Traits::__end(__seg);
+  auto __lfirst     = _Traits::__local(__first);
+  auto __seg_size   = static_cast<_IntegralSize>(std::distance(__lfirst, __slast));
+
+  // The whole range lies within a single segment; it might not start or end at the segment's boundaries
+  if (__n <= __seg_size) {
+    auto __llast = std::next(__lfirst, __n);
+    __func(__lfirst, __llast);
+    return _Traits::__compose(__seg, __llast);
+  }
+
+  // We have more than one segment. Iterate over the first segment which might not start at the beginning
+  __func(__lfirst, std::next(__lfirst, __seg_size));
+  ++__seg;
+  __n -= __seg_size;
+
+  // Iterate over the second through last segments, each of which is guaranteed to start at its segment's beginning
+  while (true) {
+    __sfirst   = _Traits::__begin(__seg);
+    __slast    = _Traits::__end(__seg);
+    __seg_size = std::distance(__sfirst, __slast);
+
+    // We are in the last segment
+    if (__n <= __seg_size) {
+      auto __llast = std::next(__sfirst, __n);
+      __func(__sfirst, __llast);
+      return _Traits::__compose(__seg, __llast);
+    }
+
+    // We are in middle segments that are completely in the range
+    __func(__sfirst, __slast);
+    ++__seg;
+    __n -= __seg_size;
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 9c6c2b97a2ad1..b92eeb6fa8d7c 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -9,7 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 
-#include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
@@ -43,9 +43,8 @@ struct __for_each_n {
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
     if constexpr (forward_iterator<_Iter>) {
-      auto __last = std::ranges::next(__first, __count);
       auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each(__first, __last, __f);
+      auto __last = std::for_each_n(__first, __count, __f);
       return {std::move(__last), std::move(__func)};
     } else {
       while (__count-- > 0) {
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 0ce42fc4d3633..da7034f7c081d 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -436,6 +436,7 @@ module std [system] {
     module find_segment_if                        { header "__algorithm/find_segment_if.h" }
     module find                                   { header "__algorithm/find.h" }
     module for_each_n                             { header "__algorithm/for_each_n.h" }
+    module for_each_n_segment                     { header "__algorithm/for_each_n_segment.h" }
     module for_each_segment                       { header "__algorithm/for_each_segment.h" }
     module for_each                               { header "__algorithm/for_each.h" }
     module generate_n                             { header "__algorithm/generate_n.h" }
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
index 760accbe4d929..1e33cf70f8487 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
@@ -23,6 +23,7 @@ int main(int argc, char** argv) {
   // {std,ranges}::for_each
   {
     auto bm = []<class Container>(std::string name, auto for_each) {
+      using ElemType = typename Container::value_type;
       benchmark::RegisterBenchmark(
           name,
           [for_each](auto& st) {
@@ -33,16 +34,34 @@ int main(int argc, char** argv) {
 
             for ([[maybe_unused]] auto _ : st) {
               benchmark::DoNotOptimize(c);
-              auto result = for_each(first, last, [](int& x) { x = std::clamp(x, 10, 100); });
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
               benchmark::DoNotOptimize(result);
             }
           })
           ->Arg(8)
           ->Arg(32)
           ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
           ->Arg(8192)
-          ->Arg(1 << 20);
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
     };
+    bm.operator()<std::vector<char>>("std::for_each(vector<char>)", std_for_each);
+    bm.operator()<std::deque<char>>("std::for_each(deque<char>)", std_for_each);
+    bm.operator()<std::list<char>>("std::for_each(list<char>)", std_for_each);
+    bm.operator()<std::vector<char>>("rng::for_each(vector<char>)", std::ranges::for_each);
+    bm.operator()<std::deque<char>>("rng::for_each(deque<char>)", std::ranges::for_each);
+    bm.operator()<std::list<char>>("rng::for_each(list<char>)", std::ranges::for_each);
+
+    bm.operator()<std::vector<short>>("std::for_each(vector<short>)", std_for_each);
+    bm.operator()<std::deque<short>>("std::for_each(deque<short>)", std_for_each);
+    bm.operator()<std::list<short>>("std::for_each(list<short>)", std_for_each);
+    bm.operator()<std::vector<short>>("rng::for_each(vector<short>)", std::ranges::for_each);
+    bm.operator()<std::deque<short>>("rng::for_each(deque<short>)", std::ranges::for_each);
+    bm.operator()<std::list<short>>("rng::for_each(list<short>)", std::ranges::for_each);
+
     bm.operator()<std::vector<int>>("std::for_each(vector<int>)", std_for_each);
     bm.operator()<std::deque<int>>("std::for_each(deque<int>)", std_for_each);
     bm.operator()<std::list<int>>("std::for_each(list<int>)", std_for_each);
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
new file mode 100644
index 0000000000000..28398ac988bf7
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
@@ -0,0 +1,122 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <list>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+int main(int argc, char** argv) {
+  auto std_for_each   = [](auto first, auto last, auto f) { return std::for_each(first, last, f); };
+  auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
+
+  // {std,ranges}::for_each
+  {
+    auto bm = []<class Container>(std::string name, auto for_each) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+            auto last  = view.end();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<char>>>("std::for_each(join_view(vector<vector<char>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<short>>>("std::for_each(join_view(vector<vector<short>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<char>>>(
+        "rng::for_each(join_view(vector<vector<char>>)", std::ranges::for_each);
+    bm.operator()<std::vector<std::vector<short>>>(
+        "rng::for_each(join_view(vector<vector<short>>)", std::ranges::for_each);
+    bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
+  }
+
+  // {std,ranges}::for_each_n
+  {
+    auto bm = []<class Container>(std::string name, auto for_each_n) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each_n](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<char>>>("std::for_each_n(join_view(vector<vector<char>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<short>>>("std::for_each_n(join_view(vector<vector<short>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<char>>>(
+        "rng::for_each_n(join_view(vector<vector<char>>)", std::ranges::for_each_n);
+    bm.operator()<std::vector<std::vector<short>>>(
+        "rng::for_each_n(join_view(vector<vector<short>>)", std::ranges::for_each_n);
+    bm.operator()<std::vector<std::vector<int>>>(
+        "rng::for_each_n(join_view(vector<vector<int>>)", std::ranges::for_each_n);
+  }
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
index af46371881577..9e77f51db10cc 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -23,6 +23,7 @@ int main(int argc, char** argv) {
   // {std,ranges}::for_each_n
   {
     auto bm = []<class Container>(std::string name, auto for_each_n) {
+      using ElemType = typename Container::value_type;
       benchmark::RegisterBenchmark(
           name,
           [for_each_n](auto& st) {
@@ -32,16 +33,34 @@ int main(int argc, char** argv) {
 
             for ([[maybe_unused]] auto _ : st) {
               benchmark::DoNotOptimize(c);
-              auto result = for_each_n(first, n, [](int& x) { x = std::clamp(x, 10, 100); });
+              auto result = for_each_n(first, n, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
               benchmark::DoNotOptimize(result);
             }
           })
           ->Arg(8)
           ->Arg(32)
           ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
           ->Arg(8192)
-          ->Arg(1 << 20);
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
     };
+    bm.operator()<std::vector<char>>("std::for_each_n(vector<char>)", std_for_each_n);
+    bm.operator()<std::deque<char>>("std::for_each_n(deque<char>)", std_for_each_n);
+    bm.operator()<std::list<char>>("std::for_each_n(list<char>)", std_for_each_n);
+    bm.operator()<std::vector<char>>("rng::for_each_n(vector<char>)", std::ranges::for_each_n);
+    bm.operator()<std::deque<char>>("rng::for_each_n(deque<char>)", std::ranges::for_each_n);
+    bm.operator()<std::list<char>>("rng::for_each_n(list<char>)", std::ranges::for_each_n);
+
+    bm.operator()<std::vector<short>>("std::for_each_n(vector<short>)", std_for_each_n);
+    bm.operator()<std::deque<short>>("std::for_each_n(deque<short>)", std_for_each_n);
+    bm.operator()<std::list<short>>("std::for_each_n(list<short>)", std_for_each_n);
+    bm.operator()<std::vector<short>>("rng::for_each_n(vector<short>)", std::ranges::for_each_n);
+    bm.operator()<std::deque<short>>("rng::for_each_n(deque<short>)", std::ranges::for_each_n);
+    bm.operator()<std::list<short>>("rng::for_each_n(list<short>)", std::ranges::for_each_n);
+
     bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
     bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
     bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);

>From 24d3e4719bc8edb32f532ae69601a3b300cdb5ee Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Sat, 29 Mar 2025 08:50:15 -0400
Subject: [PATCH 10/14] Slightly improve __for_each_n_segment

---
 .../include/__algorithm/for_each_n_segment.h  | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
index e2e19cb31ecee..81e3cf17f6db8 100644
--- a/libcxx/include/__algorithm/for_each_n_segment.h
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -21,9 +21,11 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-// __for_each_n_segment is a utility function for optimizing iterating over segmented iterators linearly.
-// __first and __orig_n are represent the begining and size of a segmented range. __func is expected to
-// take a range of local iterators. Anything that is returned from __func is ignored.
+// __for_each_n_segment optimizes linear iteration over segmented iterators. It processes a segmented
+// input range defined by (__first, __orig_n), where __first is the starting segmented iterator and
+// __orig_n is the number of elements to process. The functor __func is applied to each segment using
+// local iterator pairs for that segment. The return value of __func is ignored, and the function
+// returns an iterator pointing to one past the last processed element in the input range.
 
 template <class _SegmentedIterator, class _Size, class _Functor>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
@@ -40,32 +42,33 @@ __for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func
   auto __lfirst     = _Traits::__local(__first);
   auto __seg_size   = static_cast<_IntegralSize>(std::distance(__lfirst, __slast));
 
-  // We have only one single segment, which might not start or end at the boundaries of the segment
+  // Single-segment case: input range fits within a single segment (may not align with segment boundaries)
   if (__n <= __seg_size) {
     auto __llast = std::next(__lfirst, __n);
     __func(__lfirst, __llast);
     return _Traits::__compose(__seg, __llast);
   }
 
-  // We have more than one segment. Iterate over the first segment which might not start at the beginning
-  __func(__lfirst, std::next(__lfirst, __seg_size));
+  // Multi-segment case: input range spans multiple segments.
+  // Process the first segment which might not start at the beginning of the segment
+  __func(__lfirst, __slast);
   ++__seg;
   __n -= __seg_size;
 
-  // Iterate over the 2nd to last segments which are guaranteed to start at the beginning of each segment
+  // Process the 2nd to last segments guaranteed to start at the beginning of each segment
   while (true) {
     __sfirst   = _Traits::__begin(__seg);
     __slast    = _Traits::__end(__seg);
     __seg_size = std::distance(__sfirst, __slast);
 
-    // We are in the last segment
+    // The last (potentially partial) segment
     if (__n <= __seg_size) {
       auto __llast = std::next(__sfirst, __n);
       __func(__sfirst, __llast);
       return _Traits::__compose(__seg, __llast);
     }
 
-    // We are in middle segments that are completely in the range
+    // Middle whole segments that are completely in the range
     __func(__sfirst, __slast);
     ++__seg;
     __n -= __seg_size;

>From 52d20cfbf6de2cc5d2ce353f944a6cd9a796d9f9 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Sat, 29 Mar 2025 11:21:07 -0400
Subject: [PATCH 11/14] Consistently extend segmented iterator optimization to
 ranges::for_each

---
 libcxx/docs/ReleaseNotes/21.rst              |  2 +-
 libcxx/include/__algorithm/ranges_for_each.h | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 39c6e93be9a99..cdff856836013 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -61,7 +61,7 @@ Improvements and New Features
 - Updated formatting library to Unicode 16.0.0.
 
 - The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
-  resulting in performance improvements of up to 21.2x for ``std::deque::iterator`` segmented inputs and 17.9x for
+  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` segmented inputs and 24.9x for
   ``join_view`` of ``vector<vector<T>>``.
 
 Deprecations and Removals
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 096e60683e39d..961f7558149a3 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -10,7 +10,9 @@
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n.h>
 #include <__algorithm/in_fun_result.h>
+#include <__concepts/assignable.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -42,11 +44,14 @@ struct __for_each {
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
-      auto __n   = __last - __first;
-      auto __end = __first + __n;
-      auto __f   = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each(__first, __end, __f);
+    if constexpr (std::assignable_from<_Iter&, _Sent>) {
+      _Iter __end = std::move(__last);
+      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      return {std::move(__end), std::move(__func)};
+    } else if constexpr (sized_sentinel_for<_Sent, _Iter>) {
+      auto __end = std::for_each_n(__first, __last - __first, [&](auto&& __val) {
+        std::invoke(__func, std::invoke(__proj, __val));
+      });
       return {std::move(__end), std::move(__func)};
     } else {
       for (; __first != __last; ++__first)

>From 18637ce162a7f168e2947452ab441f6b4136840e Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Mon, 31 Mar 2025 11:34:14 -0400
Subject: [PATCH 12/14] Rename __do_segment to __segment_processor

---
 libcxx/include/__algorithm/for_each.h   | 14 ++++++++------
 libcxx/include/__algorithm/for_each_n.h |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index e6cb505e35274..57cbb47d1ba8f 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -33,14 +33,16 @@ __for_each(_InputIterator __first, _Sent __last, _Function& __f) {
   return std::move(__f);
 }
 
-// __do_segment acts as a functor for processing individual segments within the __for_each_segment{, _n} algorithms.
-template <class _InputIterator, class _Function>
-struct __do_segment {
-  using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InputIterator>;
+// __segment_processor handles the per-segment processing by applying the user-provided function to each element
+// within the segment. It acts as a functor passed to the segmented iterator algorithm __for_each_segment.
+template <class _SegmentedIterator, class _Function>
+struct __segment_processor {
+  using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_SegmentedIterator>;
 
   _Function& __func_;
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __do_segment(_Function& __func) : __func_(__func) {}
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __segment_processor(_Function& __func)
+      : __func_(__func) {}
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
   operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
@@ -53,7 +55,7 @@ template <class _SegmentedIterator,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Function
 __for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
-  std::__for_each_segment(__first, __last, std::__do_segment<_SegmentedIterator, _Function>(__func));
+  std::__for_each_segment(__first, __last, std::__segment_processor<_SegmentedIterator, _Function>(__func));
   return std::move(__func);
 }
 
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 2abb163766dcb..2eefa2c1b26ac 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -53,7 +53,7 @@ template <class _SegmentedIterator,
                         int> = 0>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
 for_each_n(_SegmentedIterator __first, _Size __orig_n, _Function __f) {
-  return std::__for_each_n_segment(__first, __orig_n, std::__do_segment<_SegmentedIterator, _Function>(__f));
+  return std::__for_each_n_segment(__first, __orig_n, std::__segment_processor<_SegmentedIterator, _Function>(__f));
 }
 
 #endif

>From d1bb4a464c57dc88018cc3327a296d4937d67f45 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Wed, 2 Apr 2025 23:15:57 -0400
Subject: [PATCH 13/14] Fix review comments

---
 libcxx/docs/ReleaseNotes/21.rst               |   4 +-
 libcxx/include/__algorithm/for_each.h         |  35 +++--
 libcxx/include/__algorithm/for_each_n.h       |  55 ++++++--
 .../include/__algorithm/for_each_n_segment.h  |   9 +-
 libcxx/include/__algorithm/ranges_for_each.h  |  16 +--
 .../include/__algorithm/ranges_for_each_n.h   |  14 +-
 .../nonmodifying/for_each.bench.cpp           |  56 ++++++--
 .../nonmodifying/for_each_join_view.bench.cpp | 122 ------------------
 .../nonmodifying/for_each_n.bench.cpp         |  55 ++++++--
 .../alg.foreach/for_each_n.pass.cpp           |   8 +-
 10 files changed, 166 insertions(+), 208 deletions(-)
 delete mode 100644 libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index cdff856836013..22d4648ba2b84 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -61,8 +61,8 @@ Improvements and New Features
 - Updated formatting library to Unicode 16.0.0.
 
 - The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
-  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` segmented inputs and 24.9x for
-  ``join_view`` of ``vector<vector<T>>``.
+  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` and 24.9x for ``join_view`` of
+  ``vector<vector<char>>``.
 
 Deprecations and Removals
 -------------------------
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index 57cbb47d1ba8f..f2e6a9e5513eb 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -12,6 +12,8 @@
 
 #include <__algorithm/for_each_segment.h>
 #include <__config>
+#include <__functional/identity.h>
+#include <__functional/invoke.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
 #include <__utility/move.h>
@@ -25,44 +27,49 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIterator, class _Sent, class _Function>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
-__for_each(_InputIterator __first, _Sent __last, _Function& __f) {
+template <class _InputIterator, class _Sent, class _Function, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+__for_each(_InputIterator __first, _Sent __last, _Function& __f, _Proj& __proj) {
   for (; __first != __last; ++__first)
-    __f(*__first);
-  return std::move(__f);
+    std::invoke(__f, std::invoke(__proj, *__first));
+  return __first;
 }
 
 // __segment_processor handles the per-segment processing by applying the user-provided function to each element
 // within the segment. It acts as a functor passed to the segmented iterator algorithm __for_each_segment.
-template <class _SegmentedIterator, class _Function>
+template <class _SegmentedIterator, class _Function, class _Proj>
 struct __segment_processor {
   using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_SegmentedIterator>;
 
   _Function& __func_;
+  _Proj& __proj_;
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __segment_processor(_Function& __func)
-      : __func_(__func) {}
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __segment_processor(_Function& __func, _Proj& __proj)
+      : __func_(__func), __proj_(__proj) {}
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
   operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
-    std::__for_each(__lfirst, __llast, __func_);
+    std::__for_each(__lfirst, __llast, __func_, __proj_);
   }
 };
 
 template <class _SegmentedIterator,
           class _Function,
+          class _Proj,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Function
-__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
-  std::__for_each_segment(__first, __last, std::__segment_processor<_SegmentedIterator, _Function>(__func));
-  return std::move(__func);
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
+__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func, _Proj& __proj) {
+  std::__for_each_segment(
+      __first, __last, std::__segment_processor<_SegmentedIterator, _Function, _Proj>(__func, __proj));
+  return __last;
 }
 
 template <class _InputIterator, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
 for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
-  return std::__for_each(__first, __last, __f);
+  __identity __proj;
+  std::__for_each(__first, __last, __f, __proj);
+  return std::move(__f);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 2eefa2c1b26ac..ca50fa2ecb8e0 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -13,8 +13,9 @@
 #include <__algorithm/for_each.h>
 #include <__algorithm/for_each_n_segment.h>
 #include <__config>
+#include <__functional/identity.h>
+#include <__functional/invoke.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
 #include <__utility/convert_to_integral.h>
@@ -25,35 +26,63 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#if _LIBCPP_STD_VER >= 17
-
 template <class _InputIterator,
           class _Size,
           class _Function,
-          __enable_if_t<!__is_segmented_iterator<_InputIterator>::value ||
-                            __has_exactly_input_iterator_category<_InputIterator>::value,
+          class _Proj,
+          __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
+                            (!__is_segmented_iterator<_InputIterator>::value
+                             //   || !__has_random_access_iterator_category<
+                             //      typename __segmented_iterator_traits<_InputIterator>::__local_iterator>::value
+                             ), // TODO: __segmented_iterator_traits<_InputIterator> results in template instantiation
+                                // during SFINAE, which is a hard error to be fixed. Once fixed, we should uncomment.
                         int> = 0>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+__for_each_n(_InputIterator __first, _Size __orig_n, _Function& __f, _Proj& __proj) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n = __orig_n;
   while (__n > 0) {
-    __f(*__first);
+    std::invoke(__f, std::invoke(__proj, *__first));
     ++__first;
     --__n;
   }
   return __first;
 }
 
+template <class _RandIter,
+          class _Size,
+          class _Function,
+          class _Proj,
+          __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
+__for_each_n(_RandIter __first, _Size __orig_n, _Function& __f, _Proj& __proj) {
+  typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
+  _IntegralSize __n = __orig_n;
+  return std::__for_each(__first, __first + __n, __f, __proj);
+}
+
 template <class _SegmentedIterator,
           class _Size,
           class _Function,
-          __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value &&
-                            __has_forward_iterator_category<_SegmentedIterator>::value,
+          class _Proj,
+          __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
+                            __is_segmented_iterator<_SegmentedIterator>::value &&
+                            __has_random_access_iterator_category<
+                                typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
                         int> = 0>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
-for_each_n(_SegmentedIterator __first, _Size __orig_n, _Function __f) {
-  return std::__for_each_n_segment(__first, __orig_n, std::__segment_processor<_SegmentedIterator, _Function>(__f));
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Function& __f, _Proj& __proj) {
+  return std::__for_each_n_segment(
+      __first, __orig_n, std::__segment_processor<_SegmentedIterator, _Function, _Proj>(__f, __proj));
+}
+
+#if _LIBCPP_STD_VER >= 17
+
+template <class _InputIterator, class _Size, class _Function>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+  __identity __proj;
+  return std::__for_each_n(__first, __orig_n, __f, __proj);
 }
 
 #endif
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
index 81e3cf17f6db8..67b0e5f3cdaeb 100644
--- a/libcxx/include/__algorithm/for_each_n_segment.h
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -11,6 +11,7 @@
 
 #include <__config>
 #include <__iterator/distance.h>
+#include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 #include <__utility/convert_to_integral.h>
@@ -22,14 +23,18 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // __for_each_n_segment optimizes linear iteration over segmented iterators. It processes a segmented
-// input range defined by (__first, __orig_n), where __first is the starting segmented iterator and
-// __orig_n is the number of elements to process. The functor __func is applied to each segment using
+// input range defined by [__first, __first + __n), where __first is the starting segmented iterator
+// and __n is the number of elements to process. The functor __func is applied to each segment using
 // local iterator pairs for that segment. The return value of __func is ignored, and the function
 // returns an iterator pointing to one past the last processed element in the input range.
 
 template <class _SegmentedIterator, class _Size, class _Functor>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
 __for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
+  static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
+                    __has_random_access_iterator_category<
+                        typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
+                "__for_each_n_segment only works with segmented iterators with random-access local iterators");
   if (__orig_n == 0)
     return __first;
 
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 961f7558149a3..ed0dcde688406 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -44,19 +44,13 @@ struct __for_each {
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    if constexpr (std::assignable_from<_Iter&, _Sent>) {
-      _Iter __end = std::move(__last);
-      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
-      return {std::move(__end), std::move(__func)};
-    } else if constexpr (sized_sentinel_for<_Sent, _Iter>) {
-      auto __end = std::for_each_n(__first, __last - __first, [&](auto&& __val) {
-        std::invoke(__func, std::invoke(__proj, __val));
-      });
+    if constexpr (!std::assignable_from<_Iter&, _Sent> && sized_sentinel_for<_Sent, _Iter>) {
+      auto __n   = __last - __first;
+      auto __end = std::__for_each_n(std::move(__first), __n, __func, __proj);
       return {std::move(__end), std::move(__func)};
     } else {
-      for (; __first != __last; ++__first)
-        std::invoke(__func, std::invoke(__proj, *__first));
-      return {std::move(__first), std::move(__func)};
+      auto __end = std::__for_each(std::move(__first), std::move(__last), __func, __proj);
+      return {std::move(__end), std::move(__func)};
     }
   }
 
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index b92eeb6fa8d7c..ebcd38a8eef6f 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -17,7 +17,6 @@
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/next.h>
 #include <__iterator/projected.h>
 #include <__ranges/concepts.h>
 #include <__utility/move.h>
@@ -42,17 +41,8 @@ struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
-    if constexpr (forward_iterator<_Iter>) {
-      auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      auto __last = std::for_each_n(__first, __count, __f);
-      return {std::move(__last), std::move(__func)};
-    } else {
-      while (__count-- > 0) {
-        std::invoke(__func, std::invoke(__proj, *__first));
-        ++__first;
-      }
-      return {std::move(__first), std::move(__func)};
-    }
+    auto __last = std::__for_each_n(std::move(__first), __count, __func, __proj);
+    return {std::move(__last), std::move(__func)};
   }
 };
 
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
index 1e33cf70f8487..9151ca19c7862 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <deque>
 #include <list>
+#include <ranges>
 #include <string>
 #include <vector>
 
@@ -48,20 +49,6 @@ int main(int argc, char** argv) {
           ->Arg(1 << 16)
           ->Arg(1 << 18);
     };
-    bm.operator()<std::vector<char>>("std::for_each(vector<char>)", std_for_each);
-    bm.operator()<std::deque<char>>("std::for_each(deque<char>)", std_for_each);
-    bm.operator()<std::list<char>>("std::for_each(list<char>)", std_for_each);
-    bm.operator()<std::vector<char>>("rng::for_each(vector<char>)", std::ranges::for_each);
-    bm.operator()<std::deque<char>>("rng::for_each(deque<char>)", std::ranges::for_each);
-    bm.operator()<std::list<char>>("rng::for_each(list<char>)", std::ranges::for_each);
-
-    bm.operator()<std::vector<short>>("std::for_each(vector<short>)", std_for_each);
-    bm.operator()<std::deque<short>>("std::for_each(deque<short>)", std_for_each);
-    bm.operator()<std::list<short>>("std::for_each(list<short>)", std_for_each);
-    bm.operator()<std::vector<short>>("rng::for_each(vector<short>)", std::ranges::for_each);
-    bm.operator()<std::deque<short>>("rng::for_each(deque<short>)", std::ranges::for_each);
-    bm.operator()<std::list<short>>("rng::for_each(list<short>)", std::ranges::for_each);
-
     bm.operator()<std::vector<int>>("std::for_each(vector<int>)", std_for_each);
     bm.operator()<std::deque<int>>("std::for_each(deque<int>)", std_for_each);
     bm.operator()<std::list<int>>("std::for_each(list<int>)", std_for_each);
@@ -70,6 +57,47 @@ int main(int argc, char** argv) {
     bm.operator()<std::list<int>>("rng::for_each(list<int>)", std::ranges::for_each);
   }
 
+  // {std,ranges}::for_each for join_view
+  {
+    auto bm = []<class Container>(std::string name, auto for_each) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+            auto last  = view.end();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
+  }
+
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
deleted file mode 100644
index 28398ac988bf7..0000000000000
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14, c++17
-
-#include <algorithm>
-#include <cstddef>
-#include <deque>
-#include <list>
-#include <ranges>
-#include <string>
-#include <vector>
-
-#include <benchmark/benchmark.h>
-
-int main(int argc, char** argv) {
-  auto std_for_each   = [](auto first, auto last, auto f) { return std::for_each(first, last, f); };
-  auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
-
-  // {std,ranges}::for_each
-  {
-    auto bm = []<class Container>(std::string name, auto for_each) {
-      using C1       = typename Container::value_type;
-      using ElemType = typename C1::value_type;
-
-      benchmark::RegisterBenchmark(
-          name,
-          [for_each](auto& st) {
-            std::size_t const size     = st.range(0);
-            std::size_t const seg_size = 256;
-            std::size_t const segments = (size + seg_size - 1) / seg_size;
-            Container c(segments);
-            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
-              c[i].resize(std::min(seg_size, n), ElemType(1));
-            }
-
-            auto view  = c | std::views::join;
-            auto first = view.begin();
-            auto last  = view.end();
-
-            for ([[maybe_unused]] auto _ : st) {
-              benchmark::DoNotOptimize(c);
-              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
-              benchmark::DoNotOptimize(result);
-            }
-          })
-          ->Arg(8)
-          ->Arg(32)
-          ->Arg(50) // non power-of-two
-          ->Arg(1024)
-          ->Arg(4096)
-          ->Arg(8192)
-          ->Arg(1 << 14)
-          ->Arg(1 << 16)
-          ->Arg(1 << 18);
-    };
-    bm.operator()<std::vector<std::vector<char>>>("std::for_each(join_view(vector<vector<char>>))", std_for_each);
-    bm.operator()<std::vector<std::vector<short>>>("std::for_each(join_view(vector<vector<short>>))", std_for_each);
-    bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
-    bm.operator()<std::vector<std::vector<char>>>(
-        "rng::for_each(join_view(vector<vector<char>>)", std::ranges::for_each);
-    bm.operator()<std::vector<std::vector<short>>>(
-        "rng::for_each(join_view(vector<vector<short>>)", std::ranges::for_each);
-    bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
-  }
-
-  // {std,ranges}::for_each_n
-  {
-    auto bm = []<class Container>(std::string name, auto for_each_n) {
-      using C1       = typename Container::value_type;
-      using ElemType = typename C1::value_type;
-      benchmark::RegisterBenchmark(
-          name,
-          [for_each_n](auto& st) {
-            std::size_t const size     = st.range(0);
-            std::size_t const seg_size = 256;
-            std::size_t const segments = (size + seg_size - 1) / seg_size;
-            Container c(segments);
-            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
-              c[i].resize(std::min(seg_size, n), ElemType(1));
-            }
-
-            auto view  = c | std::views::join;
-            auto first = view.begin();
-
-            for ([[maybe_unused]] auto _ : st) {
-              benchmark::DoNotOptimize(c);
-              auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
-              benchmark::DoNotOptimize(result);
-            }
-          })
-          ->Arg(8)
-          ->Arg(32)
-          ->Arg(50) // non power-of-two
-          ->Arg(1024)
-          ->Arg(4096)
-          ->Arg(8192)
-          ->Arg(1 << 14)
-          ->Arg(1 << 16)
-          ->Arg(1 << 18);
-    };
-    bm.operator()<std::vector<std::vector<char>>>("std::for_each_n(join_view(vector<vector<char>>))", std_for_each_n);
-    bm.operator()<std::vector<std::vector<short>>>("std::for_each_n(join_view(vector<vector<short>>))", std_for_each_n);
-    bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
-    bm.operator()<std::vector<std::vector<char>>>(
-        "rng::for_each_n(join_view(vector<vector<char>>)", std::ranges::for_each_n);
-    bm.operator()<std::vector<std::vector<short>>>(
-        "rng::for_each_n(join_view(vector<vector<short>>)", std::ranges::for_each_n);
-    bm.operator()<std::vector<std::vector<int>>>(
-        "rng::for_each_n(join_view(vector<vector<int>>)", std::ranges::for_each_n);
-  }
-
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
-  benchmark::Shutdown();
-  return 0;
-}
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
index 9e77f51db10cc..e6624bd304447 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <deque>
 #include <list>
+#include <ranges>
 #include <string>
 #include <vector>
 
@@ -47,20 +48,6 @@ int main(int argc, char** argv) {
           ->Arg(1 << 16)
           ->Arg(1 << 18);
     };
-    bm.operator()<std::vector<char>>("std::for_each_n(vector<char>)", std_for_each_n);
-    bm.operator()<std::deque<char>>("std::for_each_n(deque<char>)", std_for_each_n);
-    bm.operator()<std::list<char>>("std::for_each_n(list<char>)", std_for_each_n);
-    bm.operator()<std::vector<char>>("rng::for_each_n(vector<char>)", std::ranges::for_each_n);
-    bm.operator()<std::deque<char>>("rng::for_each_n(deque<char>)", std::ranges::for_each_n);
-    bm.operator()<std::list<char>>("rng::for_each_n(list<char>)", std::ranges::for_each_n);
-
-    bm.operator()<std::vector<short>>("std::for_each_n(vector<short>)", std_for_each_n);
-    bm.operator()<std::deque<short>>("std::for_each_n(deque<short>)", std_for_each_n);
-    bm.operator()<std::list<short>>("std::for_each_n(list<short>)", std_for_each_n);
-    bm.operator()<std::vector<short>>("rng::for_each_n(vector<short>)", std::ranges::for_each_n);
-    bm.operator()<std::deque<short>>("rng::for_each_n(deque<short>)", std::ranges::for_each_n);
-    bm.operator()<std::list<short>>("rng::for_each_n(list<short>)", std::ranges::for_each_n);
-
     bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
     bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
     bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
@@ -69,6 +56,46 @@ int main(int argc, char** argv) {
     bm.operator()<std::list<int>>("rng::for_each_n(list<int>)", std::ranges::for_each_n);
   }
 
+  // {std,ranges}::for_each_n for join_view
+  {
+    auto bm = []<class Container>(std::string name, auto for_each_n) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each_n](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<int>>>(
+        "rng::for_each_n(join_view(vector<vector<int>>)", std::ranges::for_each_n);
+  }
+
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index 4d6e4c615c022..39c1174dcec8b 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -65,7 +65,7 @@ TEST_CONSTEXPR_CXX20 bool test() {
 
     {
       unsigned count = 0;
-      Iter it        = std::for_each_n(Iter(ia), 0, [&count](int& i) mutable {
+      Iter it        = std::for_each_n(Iter(ia), 0, [&count](int& i) {
         ++i;
         ++count;
       });
@@ -75,7 +75,7 @@ TEST_CONSTEXPR_CXX20 bool test() {
 
     {
       unsigned count = 0;
-      Iter it        = std::for_each_n(Iter(ia), s, [&count](int& i) mutable {
+      Iter it        = std::for_each_n(Iter(ia), s, [&count](int& i) {
         ++i;
         ++count;
       });
@@ -87,7 +87,7 @@ TEST_CONSTEXPR_CXX20 bool test() {
 
     {
       unsigned count = 0;
-      Iter it        = std::for_each_n(Iter(ia), 1, [&count](int& i) mutable {
+      Iter it        = std::for_each_n(Iter(ia), 1, [&count](int& i) {
         ++i;
         ++count;
       });
@@ -111,7 +111,7 @@ TEST_CONSTEXPR_CXX20 bool test() {
     test_segmented_deque_iterator();
 
 #if TEST_STD_VER >= 20
-  { // Make sure that the segmented iterator optimization works during constant evaluation
+  {
     std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
     auto v                            = vec | std::views::join;
     std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });

>From 85481547f5b47242cffbec80048679e3555e484b Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Fri, 4 Apr 2025 21:17:22 -0400
Subject: [PATCH 14/14] Fix invoke call and add projection

---
 libcxx/include/__algorithm/for_each.h         | 41 +++++++++----------
 libcxx/include/__algorithm/for_each_n.h       | 23 +++++------
 .../include/__algorithm/for_each_n_segment.h  | 19 ++++-----
 libcxx/include/__algorithm/for_each_segment.h | 16 ++++----
 libcxx/include/__algorithm/ranges_for_each.h  |  1 -
 .../include/__algorithm/ranges_for_each_n.h   |  2 -
 6 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index f2e6a9e5513eb..6d945f728faa4 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -13,9 +13,9 @@
 #include <__algorithm/for_each_segment.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
+#include <__type_traits/invoke.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,46 +27,45 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIterator, class _Sent, class _Function, class _Proj>
+template <class _InputIterator, class _Sent, class _Func, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-__for_each(_InputIterator __first, _Sent __last, _Function& __f, _Proj& __proj) {
+__for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) {
   for (; __first != __last; ++__first)
-    std::invoke(__f, std::invoke(__proj, *__first));
+    std::__invoke(__f, std::__invoke(__proj, *__first));
   return __first;
 }
 
-// __segment_processor handles the per-segment processing by applying the user-provided function to each element
-// within the segment. It acts as a functor passed to the segmented iterator algorithm __for_each_segment.
-template <class _SegmentedIterator, class _Function, class _Proj>
+// __segment_processor handles the per-segment processing by applying the function object __func_ to the
+// projected value of each element within the segment. It serves as a functor utilized by the segmented
+// iterator algorithms such as __for_each_segment and __for_each_n_segment.
+template <class _SegmentedIterator, class _Func>
 struct __segment_processor {
   using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_SegmentedIterator>;
 
-  _Function& __func_;
-  _Proj& __proj_;
+  _Func& __func_;
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __segment_processor(_Function& __func, _Proj& __proj)
-      : __func_(__func), __proj_(__proj) {}
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __segment_processor(_Func& __f) : __func_(__f) {}
 
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
-  operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
-    std::__for_each(__lfirst, __llast, __func_, __proj_);
+  template <class _Proj>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
+  operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast, _Proj& __proj) {
+    std::__for_each(__lfirst, __llast, __func_, __proj);
   }
 };
 
 template <class _SegmentedIterator,
-          class _Function,
+          class _Func,
           class _Proj,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
-__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func, _Proj& __proj) {
-  std::__for_each_segment(
-      __first, __last, std::__segment_processor<_SegmentedIterator, _Function, _Proj>(__func, __proj));
+__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Func& __f, _Proj& __proj) {
+  std::__for_each_segment(__first, __last, std::__segment_processor<_SegmentedIterator, _Func>(__f), __proj);
   return __last;
 }
 
-template <class _InputIterator, class _Function>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
-for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
+template <class _InputIterator, class _Func>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Func
+for_each(_InputIterator __first, _InputIterator __last, _Func __f) {
   __identity __proj;
   std::__for_each(__first, __last, __f, __proj);
   return std::move(__f);
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index ca50fa2ecb8e0..7a1b030ea7040 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -14,10 +14,10 @@
 #include <__algorithm/for_each_n_segment.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
+#include <__type_traits/invoke.h>
 #include <__utility/convert_to_integral.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -28,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _InputIterator,
           class _Size,
-          class _Function,
+          class _Func,
           class _Proj,
           __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
                             (!__is_segmented_iterator<_InputIterator>::value
@@ -38,11 +38,11 @@ template <class _InputIterator,
                                 // during SFINAE, which is a hard error to be fixed. Once fixed, we should uncomment.
                         int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-__for_each_n(_InputIterator __first, _Size __orig_n, _Function& __f, _Proj& __proj) {
+__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n = __orig_n;
   while (__n > 0) {
-    std::invoke(__f, std::invoke(__proj, *__first));
+    std::__invoke(__f, std::__invoke(__proj, *__first));
     ++__first;
     --__n;
   }
@@ -51,11 +51,11 @@ __for_each_n(_InputIterator __first, _Size __orig_n, _Function& __f, _Proj& __pr
 
 template <class _RandIter,
           class _Size,
-          class _Function,
+          class _Func,
           class _Proj,
           __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
-__for_each_n(_RandIter __first, _Size __orig_n, _Function& __f, _Proj& __proj) {
+__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n = __orig_n;
   return std::__for_each(__first, __first + __n, __f, __proj);
@@ -63,7 +63,7 @@ __for_each_n(_RandIter __first, _Size __orig_n, _Function& __f, _Proj& __proj) {
 
 template <class _SegmentedIterator,
           class _Size,
-          class _Function,
+          class _Func,
           class _Proj,
           __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
                             __is_segmented_iterator<_SegmentedIterator>::value &&
@@ -71,16 +71,15 @@ template <class _SegmentedIterator,
                                 typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
                         int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
-__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Function& __f, _Proj& __proj) {
-  return std::__for_each_n_segment(
-      __first, __orig_n, std::__segment_processor<_SegmentedIterator, _Function, _Proj>(__f, __proj));
+__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
+  return std::__for_each_n_segment(__first, __orig_n, std::__segment_processor<_SegmentedIterator, _Func>(__f), __proj);
 }
 
 #if _LIBCPP_STD_VER >= 17
 
-template <class _InputIterator, class _Size, class _Function>
+template <class _InputIterator, class _Size, class _Func>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+for_each_n(_InputIterator __first, _Size __orig_n, _Func __f) {
   __identity __proj;
   return std::__for_each_n(__first, __orig_n, __f, __proj);
 }
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
index 67b0e5f3cdaeb..017aea71438ac 100644
--- a/libcxx/include/__algorithm/for_each_n_segment.h
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -23,14 +23,13 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // __for_each_n_segment optimizes linear iteration over segmented iterators. It processes a segmented
-// input range defined by [__first, __first + __n), where __first is the starting segmented iterator
-// and __n is the number of elements to process. The functor __func is applied to each segment using
-// local iterator pairs for that segment. The return value of __func is ignored, and the function
-// returns an iterator pointing to one past the last processed element in the input range.
+// input range [__first, __first + __orig_n) one segment at a time, calling the functor __func with the
+// local iterator pair of each segment and the projection __proj. The return value of __func is ignored,
+// and the function returns an iterator pointing to one past the last processed element in the input range.
 
-template <class _SegmentedIterator, class _Size, class _Functor>
+template <class _SegmentedIterator, class _Size, class _Functor, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
-__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
+__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func, _Proj& __proj) {
   static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
                     __has_random_access_iterator_category<
                         typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
@@ -50,13 +49,13 @@ __for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func
   // Single-segment case: input range fits within a single segment (may not align with segment boundaries)
   if (__n <= __seg_size) {
     auto __llast = std::next(__lfirst, __n);
-    __func(__lfirst, __llast);
+    __func(__lfirst, __llast, __proj);
     return _Traits::__compose(__seg, __llast);
   }
 
   // Multi-segment case: input range spans multiple segments.
   // Process the first segment which might not start at the beginning of the segment
-  __func(__lfirst, __slast);
+  __func(__lfirst, __slast, __proj);
   ++__seg;
   __n -= __seg_size;
 
@@ -69,12 +68,12 @@ __for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func
     // The last (potentially partial) segment
     if (__n <= __seg_size) {
       auto __llast = std::next(__sfirst, __n);
-      __func(__sfirst, __llast);
+      __func(__sfirst, __llast, __proj);
       return _Traits::__compose(__seg, __llast);
     }
 
     // Middle whole segments that are completely in the range
-    __func(__sfirst, __slast);
+    __func(__sfirst, __slast, __proj);
     ++__seg;
     __n -= __seg_size;
   }
diff --git a/libcxx/include/__algorithm/for_each_segment.h b/libcxx/include/__algorithm/for_each_segment.h
index 93aa8259b2f7f..64227054a823f 100644
--- a/libcxx/include/__algorithm/for_each_segment.h
+++ b/libcxx/include/__algorithm/for_each_segment.h
@@ -19,12 +19,12 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 // __for_each_segment is a utility function for optimizing iterating over segmented iterators linearly.
-// __first and __last are expected to be a segmented range. __func is expected to take a range of local iterators.
-// Anything that is returned from __func is ignored.
+// __first and __last are expected to be a segmented range. __func is expected to take a range of local
+// iterators and a projection function __proj. Anything that is returned from __func is ignored.
 
-template <class _SegmentedIterator, class _Functor>
+template <class _SegmentedIterator, class _Functor, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
-__for_each_segment(_SegmentedIterator __first, _SegmentedIterator __last, _Functor __func) {
+__for_each_segment(_SegmentedIterator __first, _SegmentedIterator __last, _Functor __func, _Proj& __proj) {
   using _Traits = __segmented_iterator_traits<_SegmentedIterator>;
 
   auto __sfirst = _Traits::__segment(__first);
@@ -32,20 +32,20 @@ __for_each_segment(_SegmentedIterator __first, _SegmentedIterator __last, _Funct
 
   // We are in a single segment, so we might not be at the beginning or end
   if (__sfirst == __slast) {
-    __func(_Traits::__local(__first), _Traits::__local(__last));
+    __func(_Traits::__local(__first), _Traits::__local(__last), __proj);
     return;
   }
 
   // We have more than one segment. Iterate over the first segment, since we might not start at the beginning
-  __func(_Traits::__local(__first), _Traits::__end(__sfirst));
+  __func(_Traits::__local(__first), _Traits::__end(__sfirst), __proj);
   ++__sfirst;
   // iterate over the segments which are guaranteed to be completely in the range
   while (__sfirst != __slast) {
-    __func(_Traits::__begin(__sfirst), _Traits::__end(__sfirst));
+    __func(_Traits::__begin(__sfirst), _Traits::__end(__sfirst), __proj);
     ++__sfirst;
   }
   // iterate over the last segment
-  __func(_Traits::__begin(__sfirst), _Traits::__local(__last));
+  __func(_Traits::__begin(__sfirst), _Traits::__local(__last), __proj);
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index ed0dcde688406..1b11b52798dd6 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -15,7 +15,6 @@
 #include <__concepts/assignable.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/concepts.h>
 #include <__iterator/projected.h>
 #include <__ranges/access.h>
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index ebcd38a8eef6f..38e5b4d94dc3c 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -12,8 +12,6 @@
 #include <__algorithm/for_each_n.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
-#include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>



More information about the libcxx-commits mailing list