[libcxx-commits] [libcxx] [libc++] Optimize ranges::{for_each, for_each_n} for segmented iterators (PR #132896)

Peng Liu via libcxx-commits libcxx-commits at lists.llvm.org
Sat Jun 7 09:38:25 PDT 2025


https://github.com/winner245 updated https://github.com/llvm/llvm-project/pull/132896

>From f2b3f4adcb673b001c1373e46557a24ffbbc7ce2 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Tue, 25 Mar 2025 03:44:59 -0400
Subject: [PATCH 01/12] Optimize ranges::{for_each, for_each_n} for segmented
 iterators

---
 libcxx/include/__algorithm/ranges_for_each.h  | 14 ++++--
 .../include/__algorithm/ranges_for_each_n.h   | 15 ++++--
 .../nonmodifying/for_each_n.bench.cpp         |  2 +-
 .../alg.foreach/ranges.for_each.pass.cpp      | 46 +++++++++++++++++--
 .../alg.foreach/ranges.for_each_n.pass.cpp    | 46 ++++++++++++++++++-
 5 files changed, 108 insertions(+), 15 deletions(-)

diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index de39bc5522753..475f85366188e 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 
+#include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
@@ -41,9 +42,16 @@ struct __for_each {
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    for (; __first != __last; ++__first)
-      std::invoke(__func, std::invoke(__proj, *__first));
-    return {std::move(__first), std::move(__func)};
+    if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
+      auto __n   = __last - __first;
+      auto __end = __first + __n;
+      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      return {std::move(__end), std::move(__func)};
+    } else {
+      for (; __first != __last; ++__first)
+        std::invoke(__func, std::invoke(__proj, *__first));
+      return {std::move(__first), std::move(__func)};
+    }
   }
 
 public:
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 603cb723233c8..3108d66001295 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 
+#include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
@@ -40,11 +41,17 @@ struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
-    while (__count-- > 0) {
-      std::invoke(__func, std::invoke(__proj, *__first));
-      ++__first;
+    if constexpr (random_access_iterator<_Iter>) {
+      auto __last = __first + __count;
+      std::for_each(__first, __last, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      return {std::move(__last), std::move(__func)};
+    } else {
+      while (__count-- > 0) {
+        std::invoke(__func, std::invoke(__proj, *__first));
+        ++__first;
+      }
+      return {std::move(__first), std::move(__func)};
     }
-    return {std::move(__first), std::move(__func)};
   }
 };
 
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
index 784708c7e01eb..0de291395463a 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -21,7 +21,7 @@
 int main(int argc, char** argv) {
   auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
 
-  // std::for_each_n
+  // {std,ranges}::for_each_n
   {
     auto bm = []<class Container>(std::string name, auto for_each_n) {
       using ElemType = typename Container::value_type;
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
index 8b9b6e82cbcb2..2f4bfb9db6dba 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
@@ -20,7 +20,10 @@
 
 #include <algorithm>
 #include <array>
+#include <cassert>
+#include <deque>
 #include <ranges>
+#include <vector>
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
@@ -30,7 +33,7 @@ struct Callable {
 };
 
 template <class Iter, class Sent = Iter>
-concept HasForEachIt = requires (Iter iter, Sent sent) { std::ranges::for_each(iter, sent, Callable{}); };
+concept HasForEachIt = requires(Iter iter, Sent sent) { std::ranges::for_each(iter, sent, Callable{}); };
 
 static_assert(HasForEachIt<int*>);
 static_assert(!HasForEachIt<InputIteratorNotDerivedFrom>);
@@ -47,7 +50,7 @@ static_assert(!HasForEachItFunc<IndirectUnaryPredicateNotPredicate>);
 static_assert(!HasForEachItFunc<IndirectUnaryPredicateNotCopyConstructible>);
 
 template <class Range>
-concept HasForEachR = requires (Range range) { std::ranges::for_each(range, Callable{}); };
+concept HasForEachR = requires(Range range) { std::ranges::for_each(range, Callable{}); };
 
 static_assert(HasForEachR<UncheckedRange<int*>>);
 static_assert(!HasForEachR<InputRangeNotDerivedFrom>);
@@ -68,7 +71,7 @@ constexpr void test_iterator() {
   { // simple test
     {
       auto func = [i = 0](int& a) mutable { a += i++; };
-      int a[] = {1, 6, 3, 4};
+      int a[]   = {1, 6, 3, 4};
       std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> decltype(auto) ret =
           std::ranges::for_each(Iter(a), Sent(Iter(a + 4)), func);
       assert(a[0] == 1);
@@ -81,8 +84,8 @@ constexpr void test_iterator() {
       assert(i == 4);
     }
     {
-      auto func = [i = 0](int& a) mutable { a += i++; };
-      int a[] = {1, 6, 3, 4};
+      auto func  = [i = 0](int& a) mutable { a += i++; };
+      int a[]    = {1, 6, 3, 4};
       auto range = std::ranges::subrange(Iter(a), Sent(Iter(a + 4)));
       std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> decltype(auto) ret =
           std::ranges::for_each(range, func);
@@ -110,6 +113,30 @@ constexpr void test_iterator() {
   }
 }
 
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
+
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
+};
+
+/*TEST_CONSTEXPR_CXX23*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+  // check that segmented iterators work properly
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+
+    std::ranges::for_each(d, deque_test(d, index));
+  }
+}
+
 constexpr bool test() {
   test_iterator<cpp17_input_iterator<int*>, sentinel_wrapper<cpp17_input_iterator<int*>>>();
   test_iterator<cpp20_input_iterator<int*>, sentinel_wrapper<cpp20_input_iterator<int*>>>();
@@ -146,6 +173,15 @@ constexpr bool test() {
     }
   }
 
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+  {
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::ranges::for_each(v, [i = 0](int x) mutable { assert(x == 2 * i++); }, [](int x) { return 2 * x; });
+  }
+
   return true;
 }
 
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
index d4b2d053d08ce..ad1447b7348f5 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
@@ -17,7 +17,12 @@
 
 #include <algorithm>
 #include <array>
+#include <cassert>
+#include <deque>
+#include <iterator>
 #include <ranges>
+#include <ranges>
+#include <vector>
 
 #include "almost_satisfies_types.h"
 #include "test_iterators.h"
@@ -27,7 +32,7 @@ struct Callable {
 };
 
 template <class Iter>
-concept HasForEachN = requires (Iter iter) { std::ranges::for_each_n(iter, 0, Callable{}); };
+concept HasForEachN = requires(Iter iter) { std::ranges::for_each_n(iter, 0, Callable{}); };
 
 static_assert(HasForEachN<int*>);
 static_assert(!HasForEachN<InputIteratorNotDerivedFrom>);
@@ -45,7 +50,7 @@ template <class Iter>
 constexpr void test_iterator() {
   { // simple test
     auto func = [i = 0](int& a) mutable { a += i++; };
-    int a[] = {1, 6, 3, 4};
+    int a[]   = {1, 6, 3, 4};
     std::same_as<std::ranges::for_each_result<Iter, decltype(func)>> auto ret =
         std::ranges::for_each_n(Iter(a), 4, func);
     assert(a[0] == 1);
@@ -64,6 +69,30 @@ constexpr void test_iterator() {
   }
 }
 
+struct deque_test {
+  std::deque<int>* d_;
+  int* i_;
+
+  deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
+
+  void operator()(int& v) {
+    assert(&(*d_)[*i_] == &v);
+    ++*i_;
+  }
+};
+
+/*TEST_CONSTEXPR_CXX23*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+  // check that segmented iterators work properly
+  int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
+  for (const int size : sizes) {
+    std::deque<int> d(size);
+    int index = 0;
+
+    std::ranges::for_each_n(d.begin(), d.size(), deque_test(d, index));
+  }
+}
+
 constexpr bool test() {
   test_iterator<cpp17_input_iterator<int*>>();
   test_iterator<cpp20_input_iterator<int*>>();
@@ -89,6 +118,19 @@ constexpr bool test() {
     assert(a[2].other == 6);
   }
 
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+    test_segmented_deque_iterator();
+
+  {
+    std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+    auto v                            = vec | std::views::join;
+    std::ranges::for_each_n(
+        v.begin(),
+        std::ranges::distance(v),
+        [i = 0](int x) mutable { assert(x == 2 * i++); },
+        [](int x) { return 2 * x; });
+  }
+
   return true;
 }
 

>From 18397b546e44cf64a7871838aa2784fe14a42103 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Tue, 25 Mar 2025 21:29:27 -0400
Subject: [PATCH 02/12] Address ldionne's review comments

---
 libcxx/include/__algorithm/for_each.h                         | 1 +
 libcxx/include/__algorithm/ranges_for_each.h                  | 4 +++-
 libcxx/include/__algorithm/ranges_for_each_n.h                | 4 +++-
 .../alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp     | 2 +-
 .../alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp   | 2 +-
 5 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index b6c2c7c056edd..2a44c1dc60704 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_H
 
 #include <__algorithm/for_each_segment.h>
+#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 475f85366188e..5d27befd9619f 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
+#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -45,7 +46,8 @@ struct __for_each {
     if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
       auto __n   = __last - __first;
       auto __end = __first + __n;
-      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      auto __f   = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
+      std::__for_each<_RangeAlgPolicy>(__first, __end, __f);
       return {std::move(__end), std::move(__func)};
     } else {
       for (; __first != __last; ++__first)
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 3108d66001295..8384ba3bb14e6 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
+#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -43,7 +44,8 @@ struct __for_each_n {
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
     if constexpr (random_access_iterator<_Iter>) {
       auto __last = __first + __count;
-      std::for_each(__first, __last, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
+      std::__for_each<_RangeAlgPolicy>(__first, __last, __f);
       return {std::move(__last), std::move(__func)};
     } else {
       while (__count-- > 0) {
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
index 2f4bfb9db6dba..14be4a42f667c 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
@@ -127,7 +127,7 @@ struct deque_test {
 
 /*TEST_CONSTEXPR_CXX23*/
 void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
-  // check that segmented iterators work properly
+  // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
     std::deque<int> d(size);
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
index ad1447b7348f5..ac073d3052170 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
@@ -83,7 +83,7 @@ struct deque_test {
 
 /*TEST_CONSTEXPR_CXX23*/
 void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
-  // check that segmented iterators work properly
+  // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
     std::deque<int> d(size);

>From 9761e7bb84d0179753897111a9266ddf2591f620 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Tue, 25 Mar 2025 23:11:34 -0400
Subject: [PATCH 03/12] Fix test and ADL call

---
 .../alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp   | 6 +++---
 .../alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
index 14be4a42f667c..a6d0afde3186a 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each.pass.cpp
@@ -125,8 +125,8 @@ struct deque_test {
   }
 };
 
-/*TEST_CONSTEXPR_CXX23*/
-void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
   // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
@@ -173,7 +173,7 @@ constexpr bool test() {
     }
   }
 
-  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
     test_segmented_deque_iterator();
 
   {
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
index ac073d3052170..1578763694231 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/ranges.for_each_n.pass.cpp
@@ -81,8 +81,8 @@ struct deque_test {
   }
 };
 
-/*TEST_CONSTEXPR_CXX23*/
-void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX23 once std::deque is constexpr
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
   // check that segmented deque iterators work properly
   int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049};
   for (const int size : sizes) {
@@ -118,7 +118,7 @@ constexpr bool test() {
     assert(a[2].other == 6);
   }
 
-  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+  if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
     test_segmented_deque_iterator();
 
   {

>From 383595a0a8e6ff3b94bc83fa8ada659d0b6fd902 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Wed, 26 Mar 2025 11:10:37 -0400
Subject: [PATCH 04/12] Make for_each segmented iterator optimization valid for
 C++03

---
 libcxx/include/__algorithm/for_each_n.h        | 1 +
 libcxx/include/__algorithm/ranges_for_each_n.h | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 29351ec39f4e7..169de84b4d95f 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -14,6 +14,7 @@
 #include <__algorithm/for_each_n_segment.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
+#include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/disjunction.h>
 #include <__type_traits/enable_if.h>
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 8384ba3bb14e6..a5c81868c2062 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -18,6 +18,7 @@
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>
+#include <__iterator/next.h>
 #include <__iterator/projected.h>
 #include <__ranges/concepts.h>
 #include <__utility/move.h>
@@ -42,8 +43,8 @@ struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
-    if constexpr (random_access_iterator<_Iter>) {
-      auto __last = __first + __count;
+    if constexpr (forward_iterator<_Iter>) {
+      auto __last = std::ranges::next(__first, __count);
       auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
       std::__for_each<_RangeAlgPolicy>(__first, __last, __f);
       return {std::move(__last), std::move(__func)};

>From b1a6f8fe11a055fb6ad8ca87b14fb1508bc23e54 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Thu, 27 Mar 2025 11:50:12 -0400
Subject: [PATCH 05/12] Allow transitive include of <optional> in affected
 headers

---
 libcxx/include/experimental/iterator | 1 +
 libcxx/include/mutex                 | 1 +
 libcxx/include/shared_mutex          | 1 +
 3 files changed, 3 insertions(+)

diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator
index d92613845a662..565bb83903ac3 100644
--- a/libcxx/include/experimental/iterator
+++ b/libcxx/include/experimental/iterator
@@ -127,6 +127,7 @@ _LIBCPP_POP_MACROS
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
 #    include <cstddef>
 #    include <iosfwd>
+#    include <optional>
 #    include <type_traits>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
diff --git a/libcxx/include/mutex b/libcxx/include/mutex
index e058b3113073e..f616bad3ac171 100644
--- a/libcxx/include/mutex
+++ b/libcxx/include/mutex
@@ -504,6 +504,7 @@ _LIBCPP_POP_MACROS
 #    include <initializer_list>
 #    include <iosfwd>
 #    include <new>
+#    include <optional>
 #    include <stdexcept>
 #    include <system_error>
 #    include <type_traits>
diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex
index e6759e413dfef..6469c02ca5874 100644
--- a/libcxx/include/shared_mutex
+++ b/libcxx/include/shared_mutex
@@ -457,6 +457,7 @@ _LIBCPP_POP_MACROS
 #  endif // _LIBCPP_HAS_THREADS
 
 #  if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
+#    include <optional>
 #    include <system_error>
 #  endif
 #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)

>From cbf100940a7448ff2656a0a90654f0515c7680f3 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Thu, 27 Mar 2025 12:00:54 -0400
Subject: [PATCH 06/12] Remove unnecessary _AlgoPolicy template parameter

---
 libcxx/include/__algorithm/for_each.h          | 1 -
 libcxx/include/__algorithm/ranges_for_each.h   | 3 +--
 libcxx/include/__algorithm/ranges_for_each_n.h | 3 +--
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index 2a44c1dc60704..b6c2c7c056edd 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_H
 
 #include <__algorithm/for_each_segment.h>
-#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 5d27befd9619f..096e60683e39d 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -11,7 +11,6 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
-#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -47,7 +46,7 @@ struct __for_each {
       auto __n   = __last - __first;
       auto __end = __first + __n;
       auto __f   = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each<_RangeAlgPolicy>(__first, __end, __f);
+      std::__for_each(__first, __end, __f);
       return {std::move(__end), std::move(__func)};
     } else {
       for (; __first != __last; ++__first)
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index a5c81868c2062..9c6c2b97a2ad1 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -11,7 +11,6 @@
 
 #include <__algorithm/for_each.h>
 #include <__algorithm/in_fun_result.h>
-#include <__algorithm/iterator_operations.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -46,7 +45,7 @@ struct __for_each_n {
     if constexpr (forward_iterator<_Iter>) {
       auto __last = std::ranges::next(__first, __count);
       auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each<_RangeAlgPolicy>(__first, __last, __f);
+      std::__for_each(__first, __last, __f);
       return {std::move(__last), std::move(__func)};
     } else {
       while (__count-- > 0) {

>From a0564e9d36ed33998f4e31bdf14b4974eb83db31 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Fri, 28 Mar 2025 20:26:31 -0400
Subject: [PATCH 07/12] Apply optimization for join_view segmented iterators

---
 libcxx/docs/ReleaseNotes/21.rst               |   6 +
 .../include/__algorithm/ranges_for_each_n.h   |   5 +-
 .../nonmodifying/for_each.bench.cpp           |  23 +++-
 .../nonmodifying/for_each_join_view.bench.cpp | 122 ++++++++++++++++++
 .../nonmodifying/for_each_n.bench.cpp         |  14 ++
 5 files changed, 165 insertions(+), 5 deletions(-)
 create mode 100644 libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 6cbc0baf29487..b652ed2f4eb1e 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -64,11 +64,17 @@ Improvements and New Features
 
 - The ``num_put::do_put`` integral overloads have been optimized, resulting in a performance improvement of up to 2.4x.
 
+<<<<<<< HEAD
 - The ``std::stable_sort`` algorithm uses radix sort for floating-point types now, which can improve the performance
   up to 10x, depending on type of sorted elements and the initial state of the sorted array.
 
 - The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
   in C++23 and later.
+=======
+- The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
+  resulting in performance improvements of up to 21.2x for ``std::deque::iterator`` segmented inputs and 17.9x for
+  ``join_view`` of ``vector<vector<T>>``.
+>>>>>>> 50ac206d4a13 (Apply optimization for join_view segmented iterators)
 
 - The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
   up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index 9c6c2b97a2ad1..b92eeb6fa8d7c 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -9,7 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_N_H
 
-#include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n.h>
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
@@ -43,9 +43,8 @@ struct __for_each_n {
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
     if constexpr (forward_iterator<_Iter>) {
-      auto __last = std::ranges::next(__first, __count);
       auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each(__first, __last, __f);
+      auto __last = std::for_each_n(__first, __count, __f);
       return {std::move(__last), std::move(__func)};
     } else {
       while (__count-- > 0) {
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
index 760accbe4d929..1e33cf70f8487 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
@@ -23,6 +23,7 @@ int main(int argc, char** argv) {
   // {std,ranges}::for_each
   {
     auto bm = []<class Container>(std::string name, auto for_each) {
+      using ElemType = typename Container::value_type;
       benchmark::RegisterBenchmark(
           name,
           [for_each](auto& st) {
@@ -33,16 +34,34 @@ int main(int argc, char** argv) {
 
             for ([[maybe_unused]] auto _ : st) {
               benchmark::DoNotOptimize(c);
-              auto result = for_each(first, last, [](int& x) { x = std::clamp(x, 10, 100); });
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
               benchmark::DoNotOptimize(result);
             }
           })
           ->Arg(8)
           ->Arg(32)
           ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
           ->Arg(8192)
-          ->Arg(1 << 20);
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
     };
+    bm.operator()<std::vector<char>>("std::for_each(vector<char>)", std_for_each);
+    bm.operator()<std::deque<char>>("std::for_each(deque<char>)", std_for_each);
+    bm.operator()<std::list<char>>("std::for_each(list<char>)", std_for_each);
+    bm.operator()<std::vector<char>>("rng::for_each(vector<char>)", std::ranges::for_each);
+    bm.operator()<std::deque<char>>("rng::for_each(deque<char>)", std::ranges::for_each);
+    bm.operator()<std::list<char>>("rng::for_each(list<char>)", std::ranges::for_each);
+
+    bm.operator()<std::vector<short>>("std::for_each(vector<short>)", std_for_each);
+    bm.operator()<std::deque<short>>("std::for_each(deque<short>)", std_for_each);
+    bm.operator()<std::list<short>>("std::for_each(list<short>)", std_for_each);
+    bm.operator()<std::vector<short>>("rng::for_each(vector<short>)", std::ranges::for_each);
+    bm.operator()<std::deque<short>>("rng::for_each(deque<short>)", std::ranges::for_each);
+    bm.operator()<std::list<short>>("rng::for_each(list<short>)", std::ranges::for_each);
+
     bm.operator()<std::vector<int>>("std::for_each(vector<int>)", std_for_each);
     bm.operator()<std::deque<int>>("std::for_each(deque<int>)", std_for_each);
     bm.operator()<std::list<int>>("std::for_each(list<int>)", std_for_each);
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
new file mode 100644
index 0000000000000..28398ac988bf7
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
@@ -0,0 +1,122 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <list>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+int main(int argc, char** argv) {
+  auto std_for_each   = [](auto first, auto last, auto f) { return std::for_each(first, last, f); };
+  auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
+
+  // {std,ranges}::for_each
+  {
+    auto bm = []<class Container>(std::string name, auto for_each) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+            auto last  = view.end();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<char>>>("std::for_each(join_view(vector<vector<char>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<short>>>("std::for_each(join_view(vector<vector<short>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<char>>>(
+        "rng::for_each(join_view(vector<vector<char>>)", std::ranges::for_each);
+    bm.operator()<std::vector<std::vector<short>>>(
+        "rng::for_each(join_view(vector<vector<short>>)", std::ranges::for_each);
+    bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
+  }
+
+  // {std,ranges}::for_each_n
+  {
+    auto bm = []<class Container>(std::string name, auto for_each_n) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each_n](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<char>>>("std::for_each_n(join_view(vector<vector<char>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<short>>>("std::for_each_n(join_view(vector<vector<short>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<char>>>(
+        "rng::for_each_n(join_view(vector<vector<char>>)", std::ranges::for_each_n);
+    bm.operator()<std::vector<std::vector<short>>>(
+        "rng::for_each_n(join_view(vector<vector<short>>)", std::ranges::for_each_n);
+    bm.operator()<std::vector<std::vector<int>>>(
+        "rng::for_each_n(join_view(vector<vector<int>>)", std::ranges::for_each_n);
+  }
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
index 0de291395463a..f0dcc30a39e14 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -48,6 +48,20 @@ int main(int argc, char** argv) {
           ->Arg(1 << 16)
           ->Arg(1 << 18);
     };
+    bm.operator()<std::vector<char>>("std::for_each_n(vector<char>)", std_for_each_n);
+    bm.operator()<std::deque<char>>("std::for_each_n(deque<char>)", std_for_each_n);
+    bm.operator()<std::list<char>>("std::for_each_n(list<char>)", std_for_each_n);
+    bm.operator()<std::vector<char>>("rng::for_each_n(vector<char>)", std::ranges::for_each_n);
+    bm.operator()<std::deque<char>>("rng::for_each_n(deque<char>)", std::ranges::for_each_n);
+    bm.operator()<std::list<char>>("rng::for_each_n(list<char>)", std::ranges::for_each_n);
+
+    bm.operator()<std::vector<short>>("std::for_each_n(vector<short>)", std_for_each_n);
+    bm.operator()<std::deque<short>>("std::for_each_n(deque<short>)", std_for_each_n);
+    bm.operator()<std::list<short>>("std::for_each_n(list<short>)", std_for_each_n);
+    bm.operator()<std::vector<short>>("rng::for_each_n(vector<short>)", std::ranges::for_each_n);
+    bm.operator()<std::deque<short>>("rng::for_each_n(deque<short>)", std::ranges::for_each_n);
+    bm.operator()<std::list<short>>("rng::for_each_n(list<short>)", std::ranges::for_each_n);
+
     bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
     bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
     bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);

>From dd8c5cae772251b548f4573a5b572dcba9da1ad3 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Sat, 29 Mar 2025 11:21:07 -0400
Subject: [PATCH 08/12] Consistently extend segmented iterator optimization to
 ranges::for_each

---
 libcxx/docs/ReleaseNotes/21.rst              |  2 +-
 libcxx/include/__algorithm/ranges_for_each.h | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index b652ed2f4eb1e..a3bbc59b9bd2b 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -72,7 +72,7 @@ Improvements and New Features
   in C++23 and later.
 =======
 - The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
-  resulting in performance improvements of up to 21.2x for ``std::deque::iterator`` segmented inputs and 17.9x for
+  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` segmented inputs and 24.9x for
   ``join_view`` of ``vector<vector<T>>``.
 >>>>>>> 50ac206d4a13 (Apply optimization for join_view segmented iterators)
 
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 096e60683e39d..961f7558149a3 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -10,7 +10,9 @@
 #define _LIBCPP___ALGORITHM_RANGES_FOR_EACH_H
 
 #include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n.h>
 #include <__algorithm/in_fun_result.h>
+#include <__concepts/assignable.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -42,11 +44,14 @@ struct __for_each {
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    if constexpr (random_access_iterator<_Iter> && sized_sentinel_for<_Sent, _Iter>) {
-      auto __n   = __last - __first;
-      auto __end = __first + __n;
-      auto __f   = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      std::__for_each(__first, __end, __f);
+    if constexpr (std::assignable_from<_Iter&, _Sent>) {
+      _Iter __end = std::move(__last);
+      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
+      return {std::move(__end), std::move(__func)};
+    } else if constexpr (sized_sentinel_for<_Sent, _Iter>) {
+      auto __end = std::for_each_n(__first, __last - __first, [&](auto&& __val) {
+        std::invoke(__func, std::invoke(__proj, __val));
+      });
       return {std::move(__end), std::move(__func)};
     } else {
       for (; __first != __last; ++__first)

>From 4e66faab0159e2092ea50d45c298fd4948fb93e9 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Wed, 2 Apr 2025 23:15:57 -0400
Subject: [PATCH 09/12] Fix review comments

---
 libcxx/docs/ReleaseNotes/21.rst               |   5 +
 libcxx/include/__algorithm/for_each.h         |  18 ++-
 libcxx/include/__algorithm/for_each_n.h       |  19 ++-
 .../include/__algorithm/for_each_n_segment.h  |   6 +
 libcxx/include/__algorithm/ranges_for_each.h  |  16 +--
 .../include/__algorithm/ranges_for_each_n.h   |  14 +-
 .../nonmodifying/for_each.bench.cpp           |  56 ++++++--
 .../nonmodifying/for_each_join_view.bench.cpp | 122 ------------------
 .../nonmodifying/for_each_n.bench.cpp         |  54 ++++++--
 9 files changed, 124 insertions(+), 186 deletions(-)
 delete mode 100644 libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index a3bbc59b9bd2b..49c188ebac420 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -72,9 +72,14 @@ Improvements and New Features
   in C++23 and later.
 =======
 - The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
+<<<<<<< HEAD
   resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` segmented inputs and 24.9x for
   ``join_view`` of ``vector<vector<T>>``.
 >>>>>>> 50ac206d4a13 (Apply optimization for join_view segmented iterators)
+=======
+  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` and 24.9x for ``join_view`` of
+  ``vector<vector<char>>``.
+>>>>>>> 590136ba0d9f (Fix review comments)
 
 - The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
   up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index b6c2c7c056edd..01ddad761bb57 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -12,6 +12,8 @@
 
 #include <__algorithm/for_each_segment.h>
 #include <__config>
+#include <__functional/identity.h>
+#include <__functional/invoke.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
 
@@ -21,21 +23,25 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _InputIterator, class _Sent, class _Func>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __for_each(_InputIterator __first, _Sent __last, _Func& __f) {
+template <class _InputIterator, class _Sent, class _Func, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+__for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) {
   for (; __first != __last; ++__first)
-    __f(*__first);
+    std::invoke(__f, std::invoke(__proj, *__first));
+  return __first;
 }
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _SegmentedIterator,
           class _Function,
+          class _Proj,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func, _Proj& __proj) {
   using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
   std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
-    std::__for_each(__lfirst, __llast, __func);
+    std::__for_each(__lfirst, __llast, __func, __proj);
   });
+  return __last;
 }
 #endif // !_LIBCPP_CXX03_LANG
@@ -43,7 +49,8 @@ __for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __f
 template <class _InputIterator, class _Function>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
 for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
-  std::__for_each(__first, __last, __f);
+  __identity __proj;
+  std::__for_each(__first, __last, __f, __proj);
   return __f;
 }
 
 
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 169de84b4d95f..953662afd6310 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -13,8 +13,9 @@
 #include <__algorithm/for_each.h>
 #include <__algorithm/for_each_n_segment.h>
 #include <__config>
+#include <__functional/identity.h>
+#include <__functional/invoke.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/disjunction.h>
 #include <__type_traits/enable_if.h>
@@ -34,16 +35,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _InputIterator,
           class _Size,
           class _Func,
+          class _Proj,
           __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
                             _Or< _Not<__is_segmented_iterator<_InputIterator> >,
                                  _Not<__has_random_access_local_iterator<_InputIterator> > >::value,
                         int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
+__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n = __orig_n;
   while (__n > 0) {
-    __f(*__first);
+    std::invoke(__f, std::invoke(__proj, *__first));
     ++__first;
     --__n;
   }
@@ -53,12 +55,13 @@ __for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
 template <class _RandIter,
           class _Size,
           class _Func,
+          class _Proj,
           __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
-__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
+__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
   auto __last                                                   = __first + __n;
-  std::__for_each(__first, __last, __f);
+  std::__for_each(__first, __last, __f, __proj);
   return std::move(__last);
 }
 
@@ -66,16 +69,17 @@ __for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
 template <class _SegmentedIterator,
           class _Size,
           class _Func,
+          class _Proj,
           __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
                             __is_segmented_iterator<_SegmentedIterator>::value &&
                             __has_random_access_iterator_category<
                                 typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
                         int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
-__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
+__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
   return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
-    std::__for_each(__lfirst, __llast, __f);
+    std::__for_each(__lfirst, __llast, __f, __proj);
   });
 }
 #endif // !_LIBCPP_CXX03_LANG
@@ -85,7 +89,8 @@ __for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
 template <class _InputIterator, class _Size, class _Function>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
-  return std::__for_each_n(__first, __orig_n, __f);
+  __identity __proj;
+  return std::__for_each_n(__first, __orig_n, __f, __proj);
 }
 
 #endif // _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
index 1b522fb373eee..6c257dbcdc3ea 100644
--- a/libcxx/include/__algorithm/for_each_n_segment.h
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -10,7 +10,9 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
 
 #include <__config>
+#include <__iterator/distance.h>
 #include <__iterator/iterator_traits.h>
+#include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 961f7558149a3..ed0dcde688406 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -44,19 +44,13 @@ struct __for_each {
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    if constexpr (std::assignable_from<_Iter&, _Sent>) {
-      _Iter __end = std::move(__last);
-      std::for_each(__first, __end, [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); });
-      return {std::move(__end), std::move(__func)};
-    } else if constexpr (sized_sentinel_for<_Sent, _Iter>) {
-      auto __end = std::for_each_n(__first, __last - __first, [&](auto&& __val) {
-        std::invoke(__func, std::invoke(__proj, __val));
-      });
+    if constexpr (!std::assignable_from<_Iter&, _Sent> && sized_sentinel_for<_Sent, _Iter>) {
+      auto __n   = __last - __first;
+      auto __end = std::__for_each_n(std::move(__first), __n, __func, __proj);
       return {std::move(__end), std::move(__func)};
     } else {
-      for (; __first != __last; ++__first)
-        std::invoke(__func, std::invoke(__proj, *__first));
-      return {std::move(__first), std::move(__func)};
+      auto __end = std::__for_each(std::move(__first), std::move(__last), __func, __proj);
+      return {std::move(__end), std::move(__func)};
     }
   }
 
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index b92eeb6fa8d7c..ebcd38a8eef6f 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -17,7 +17,6 @@
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/next.h>
 #include <__iterator/projected.h>
 #include <__ranges/concepts.h>
 #include <__utility/move.h>
@@ -42,17 +41,8 @@ struct __for_each_n {
   template <input_iterator _Iter, class _Proj = identity, indirectly_unary_invocable<projected<_Iter, _Proj>> _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr for_each_n_result<_Iter, _Func>
   operator()(_Iter __first, iter_difference_t<_Iter> __count, _Func __func, _Proj __proj = {}) const {
-    if constexpr (forward_iterator<_Iter>) {
-      auto __f    = [&](auto&& __val) { std::invoke(__func, std::invoke(__proj, __val)); };
-      auto __last = std::for_each_n(__first, __count, __f);
-      return {std::move(__last), std::move(__func)};
-    } else {
-      while (__count-- > 0) {
-        std::invoke(__func, std::invoke(__proj, *__first));
-        ++__first;
-      }
-      return {std::move(__first), std::move(__func)};
-    }
+    auto __last = std::__for_each_n(std::move(__first), __count, __func, __proj);
+    return {std::move(__last), std::move(__func)};
   }
 };
 
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
index 1e33cf70f8487..9151ca19c7862 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each.bench.cpp
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <deque>
 #include <list>
+#include <ranges>
 #include <string>
 #include <vector>
 
@@ -48,20 +49,6 @@ int main(int argc, char** argv) {
           ->Arg(1 << 16)
           ->Arg(1 << 18);
     };
-    bm.operator()<std::vector<char>>("std::for_each(vector<char>)", std_for_each);
-    bm.operator()<std::deque<char>>("std::for_each(deque<char>)", std_for_each);
-    bm.operator()<std::list<char>>("std::for_each(list<char>)", std_for_each);
-    bm.operator()<std::vector<char>>("rng::for_each(vector<char>)", std::ranges::for_each);
-    bm.operator()<std::deque<char>>("rng::for_each(deque<char>)", std::ranges::for_each);
-    bm.operator()<std::list<char>>("rng::for_each(list<char>)", std::ranges::for_each);
-
-    bm.operator()<std::vector<short>>("std::for_each(vector<short>)", std_for_each);
-    bm.operator()<std::deque<short>>("std::for_each(deque<short>)", std_for_each);
-    bm.operator()<std::list<short>>("std::for_each(list<short>)", std_for_each);
-    bm.operator()<std::vector<short>>("rng::for_each(vector<short>)", std::ranges::for_each);
-    bm.operator()<std::deque<short>>("rng::for_each(deque<short>)", std::ranges::for_each);
-    bm.operator()<std::list<short>>("rng::for_each(list<short>)", std::ranges::for_each);
-
     bm.operator()<std::vector<int>>("std::for_each(vector<int>)", std_for_each);
     bm.operator()<std::deque<int>>("std::for_each(deque<int>)", std_for_each);
     bm.operator()<std::list<int>>("std::for_each(list<int>)", std_for_each);
@@ -70,6 +57,47 @@ int main(int argc, char** argv) {
     bm.operator()<std::list<int>>("rng::for_each(list<int>)", std::ranges::for_each);
   }
 
+  // {std,ranges}::for_each for join_view
+  {
+    auto bm = []<class Container>(std::string name, auto for_each) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+            auto last  = view.end();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
+    bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
+  }
+
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
deleted file mode 100644
index 28398ac988bf7..0000000000000
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_join_view.bench.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14, c++17
-
-#include <algorithm>
-#include <cstddef>
-#include <deque>
-#include <list>
-#include <ranges>
-#include <string>
-#include <vector>
-
-#include <benchmark/benchmark.h>
-
-int main(int argc, char** argv) {
-  auto std_for_each   = [](auto first, auto last, auto f) { return std::for_each(first, last, f); };
-  auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); };
-
-  // {std,ranges}::for_each
-  {
-    auto bm = []<class Container>(std::string name, auto for_each) {
-      using C1       = typename Container::value_type;
-      using ElemType = typename C1::value_type;
-
-      benchmark::RegisterBenchmark(
-          name,
-          [for_each](auto& st) {
-            std::size_t const size     = st.range(0);
-            std::size_t const seg_size = 256;
-            std::size_t const segments = (size + seg_size - 1) / seg_size;
-            Container c(segments);
-            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
-              c[i].resize(std::min(seg_size, n), ElemType(1));
-            }
-
-            auto view  = c | std::views::join;
-            auto first = view.begin();
-            auto last  = view.end();
-
-            for ([[maybe_unused]] auto _ : st) {
-              benchmark::DoNotOptimize(c);
-              auto result = for_each(first, last, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
-              benchmark::DoNotOptimize(result);
-            }
-          })
-          ->Arg(8)
-          ->Arg(32)
-          ->Arg(50) // non power-of-two
-          ->Arg(1024)
-          ->Arg(4096)
-          ->Arg(8192)
-          ->Arg(1 << 14)
-          ->Arg(1 << 16)
-          ->Arg(1 << 18);
-    };
-    bm.operator()<std::vector<std::vector<char>>>("std::for_each(join_view(vector<vector<char>>))", std_for_each);
-    bm.operator()<std::vector<std::vector<short>>>("std::for_each(join_view(vector<vector<short>>))", std_for_each);
-    bm.operator()<std::vector<std::vector<int>>>("std::for_each(join_view(vector<vector<int>>))", std_for_each);
-    bm.operator()<std::vector<std::vector<char>>>(
-        "rng::for_each(join_view(vector<vector<char>>)", std::ranges::for_each);
-    bm.operator()<std::vector<std::vector<short>>>(
-        "rng::for_each(join_view(vector<vector<short>>)", std::ranges::for_each);
-    bm.operator()<std::vector<std::vector<int>>>("rng::for_each(join_view(vector<vector<int>>)", std::ranges::for_each);
-  }
-
-  // {std,ranges}::for_each_n
-  {
-    auto bm = []<class Container>(std::string name, auto for_each_n) {
-      using C1       = typename Container::value_type;
-      using ElemType = typename C1::value_type;
-      benchmark::RegisterBenchmark(
-          name,
-          [for_each_n](auto& st) {
-            std::size_t const size     = st.range(0);
-            std::size_t const seg_size = 256;
-            std::size_t const segments = (size + seg_size - 1) / seg_size;
-            Container c(segments);
-            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
-              c[i].resize(std::min(seg_size, n), ElemType(1));
-            }
-
-            auto view  = c | std::views::join;
-            auto first = view.begin();
-
-            for ([[maybe_unused]] auto _ : st) {
-              benchmark::DoNotOptimize(c);
-              auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
-              benchmark::DoNotOptimize(result);
-            }
-          })
-          ->Arg(8)
-          ->Arg(32)
-          ->Arg(50) // non power-of-two
-          ->Arg(1024)
-          ->Arg(4096)
-          ->Arg(8192)
-          ->Arg(1 << 14)
-          ->Arg(1 << 16)
-          ->Arg(1 << 18);
-    };
-    bm.operator()<std::vector<std::vector<char>>>("std::for_each_n(join_view(vector<vector<char>>))", std_for_each_n);
-    bm.operator()<std::vector<std::vector<short>>>("std::for_each_n(join_view(vector<vector<short>>))", std_for_each_n);
-    bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
-    bm.operator()<std::vector<std::vector<char>>>(
-        "rng::for_each_n(join_view(vector<vector<char>>)", std::ranges::for_each_n);
-    bm.operator()<std::vector<std::vector<short>>>(
-        "rng::for_each_n(join_view(vector<vector<short>>)", std::ranges::for_each_n);
-    bm.operator()<std::vector<std::vector<int>>>(
-        "rng::for_each_n(join_view(vector<vector<int>>)", std::ranges::for_each_n);
-  }
-
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
-  benchmark::Shutdown();
-  return 0;
-}
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
index f0dcc30a39e14..3ace25a6052b6 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -48,20 +48,6 @@ int main(int argc, char** argv) {
           ->Arg(1 << 16)
           ->Arg(1 << 18);
     };
-    bm.operator()<std::vector<char>>("std::for_each_n(vector<char>)", std_for_each_n);
-    bm.operator()<std::deque<char>>("std::for_each_n(deque<char>)", std_for_each_n);
-    bm.operator()<std::list<char>>("std::for_each_n(list<char>)", std_for_each_n);
-    bm.operator()<std::vector<char>>("rng::for_each_n(vector<char>)", std::ranges::for_each_n);
-    bm.operator()<std::deque<char>>("rng::for_each_n(deque<char>)", std::ranges::for_each_n);
-    bm.operator()<std::list<char>>("rng::for_each_n(list<char>)", std::ranges::for_each_n);
-
-    bm.operator()<std::vector<short>>("std::for_each_n(vector<short>)", std_for_each_n);
-    bm.operator()<std::deque<short>>("std::for_each_n(deque<short>)", std_for_each_n);
-    bm.operator()<std::list<short>>("std::for_each_n(list<short>)", std_for_each_n);
-    bm.operator()<std::vector<short>>("rng::for_each_n(vector<short>)", std::ranges::for_each_n);
-    bm.operator()<std::deque<short>>("rng::for_each_n(deque<short>)", std::ranges::for_each_n);
-    bm.operator()<std::list<short>>("rng::for_each_n(list<short>)", std::ranges::for_each_n);
-
     bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
     bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
     bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
@@ -105,6 +91,46 @@ int main(int argc, char** argv) {
     bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
   }
 
+  // {std,ranges}::for_each_n for join_view
+  {
+    auto bm = []<class Container>(std::string name, auto for_each_n) {
+      using C1       = typename Container::value_type;
+      using ElemType = typename C1::value_type;
+      benchmark::RegisterBenchmark(
+          name,
+          [for_each_n](auto& st) {
+            std::size_t const size     = st.range(0);
+            std::size_t const seg_size = 256;
+            std::size_t const segments = (size + seg_size - 1) / seg_size;
+            Container c(segments);
+            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
+              c[i].resize(std::min(seg_size, n), ElemType(1));
+            }
+
+            auto view  = c | std::views::join;
+            auto first = view.begin();
+
+            for ([[maybe_unused]] auto _ : st) {
+              benchmark::DoNotOptimize(c);
+              auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
+              benchmark::DoNotOptimize(result);
+            }
+          })
+          ->Arg(8)
+          ->Arg(32)
+          ->Arg(50) // non power-of-two
+          ->Arg(1024)
+          ->Arg(4096)
+          ->Arg(8192)
+          ->Arg(1 << 14)
+          ->Arg(1 << 16)
+          ->Arg(1 << 18);
+    };
+    bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+    bm.operator()<std::vector<std::vector<int>>>(
+        "rng::for_each_n(join_view(vector<vector<int>>))", std::ranges::for_each_n);
+  }
+
   benchmark::Initialize(&argc, argv);
   benchmark::RunSpecifiedBenchmarks();
   benchmark::Shutdown();

>From 5224ec967d5a431225cecba34d9f435a0458511b Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Fri, 4 Apr 2025 21:17:22 -0400
Subject: [PATCH 10/12] Fix invoke call by using std::__invoke

---
 libcxx/include/__algorithm/for_each.h          | 12 ++++++------
 libcxx/include/__algorithm/for_each_n.h        |  9 +++++----
 libcxx/include/__algorithm/ranges_for_each.h   |  1 -
 libcxx/include/__algorithm/ranges_for_each_n.h |  1 -
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index 01ddad761bb57..a3cee6783154f 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -13,9 +13,9 @@
 #include <__algorithm/for_each_segment.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
+#include <__type_traits/invoke.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -27,13 +27,13 @@ template <class _InputIterator, class _Sent, class _Func, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
 __for_each(_InputIterator __first, _Sent __last, _Func& __f, _Proj& __proj) {
   for (; __first != __last; ++__first)
-    std::invoke(__f, std::invoke(__proj, *__first));
+    std::__invoke(__f, std::__invoke(__proj, *__first));
   return __first;
 }
 
 #ifndef _LIBCPP_CXX03_LANG
 template <class _SegmentedIterator,
-          class _Function,
+          class _Func,
           class _Proj,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
@@ -45,9 +45,9 @@ __for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __f
 }
 #endif // !_LIBCPP_CXX03_LANG
 
-template <class _InputIterator, class _Function>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Function
-for_each(_InputIterator __first, _InputIterator __last, _Function __f) {
+template <class _InputIterator, class _Func>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Func
+for_each(_InputIterator __first, _InputIterator __last, _Func __f) {
   __identity __proj;
   std::__for_each(__first, __last, __f, __proj);
   return __f;
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index 953662afd6310..fb0f14fae49ae 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -14,12 +14,13 @@
 #include <__algorithm/for_each_n_segment.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/iterator_traits.h>
+#include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/disjunction.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/negation.h>
+#include <__type_traits/invoke.h>
 #include <__utility/convert_to_integral.h>
 #include <__utility/move.h>
 
@@ -45,7 +46,7 @@ __for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f, _Proj& __proj)
   typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
   _IntegralSize __n = __orig_n;
   while (__n > 0) {
-    std::invoke(__f, std::invoke(__proj, *__first));
+    std::__invoke(__f, std::__invoke(__proj, *__first));
     ++__first;
     --__n;
   }
@@ -86,9 +87,9 @@ __for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f, _Proj& __pr
 
 #if _LIBCPP_STD_VER >= 17
 
-template <class _InputIterator, class _Size, class _Function>
+template <class _InputIterator, class _Size, class _Func>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+for_each_n(_InputIterator __first, _Size __orig_n, _Func __f) {
   __identity __proj;
   return std::__for_each_n(__first, __orig_n, __f, __proj);
 }
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index ed0dcde688406..1b11b52798dd6 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -15,7 +15,6 @@
 #include <__concepts/assignable.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/concepts.h>
 #include <__iterator/projected.h>
 #include <__ranges/access.h>
diff --git a/libcxx/include/__algorithm/ranges_for_each_n.h b/libcxx/include/__algorithm/ranges_for_each_n.h
index ebcd38a8eef6f..3aab1b79c10a1 100644
--- a/libcxx/include/__algorithm/ranges_for_each_n.h
+++ b/libcxx/include/__algorithm/ranges_for_each_n.h
@@ -13,7 +13,6 @@
 #include <__algorithm/in_fun_result.h>
 #include <__config>
 #include <__functional/identity.h>
-#include <__functional/invoke.h>
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h>

>From 2000f99c5a1de90e19828c335e00ebe56059d3ac Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Sat, 5 Apr 2025 16:45:17 -0400
Subject: [PATCH 11/12] Refactor to simplify logic of for_each_n_segment.h

---
 libcxx/docs/ReleaseNotes/21.rst               | 15 ++-----
 libcxx/include/__algorithm/for_each.h         | 11 ++++-
 libcxx/include/__algorithm/for_each_n.h       |  7 ++--
 .../include/__algorithm/for_each_n_segment.h  |  6 ---
 .../nonmodifying/for_each_n.bench.cpp         | 41 ++-----------------
 5 files changed, 19 insertions(+), 61 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 49c188ebac420..9f1a32a222f0d 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -64,22 +64,11 @@ Improvements and New Features
 
 - The ``num_put::do_put`` integral overloads have been optimized, resulting in a performance improvement of up to 2.4x.
 
-<<<<<<< HEAD
 - The ``std::stable_sort`` algorithm uses radix sort for floating-point types now, which can improve the performance
   up to 10x, depending on type of sorted elements and the initial state of the sorted array.
 
 - The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
   in C++23 and later.
-=======
-- The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
-<<<<<<< HEAD
-  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` segmented inputs and 24.9x for
-  ``join_view`` of ``vector<vector<T>>``.
->>>>>>> 50ac206d4a13 (Apply optimization for join_view segmented iterators)
-=======
-  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` and 24.9x for ``join_view`` of
-  ``vector<vector<char>>``.
->>>>>>> 590136ba0d9f (Fix review comments)
 
 - The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
   up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
@@ -87,6 +76,10 @@ Improvements and New Features
 - The ``bitset::to_string`` function has been optimized, resulting in a performance improvement of up to 8.3x for bitsets
   with uniformly distributed zeros and ones, and up to 13.5x and 16.1x for sparse and dense bitsets, respectively.
 
+- The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
+  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` and 24.9x for ``join_view`` of
+  ``vector<vector<char>>``.
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index a3cee6783154f..4167eec3506e4 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -16,11 +16,15 @@
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/invoke.h>
+#include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _InputIterator, class _Sent, class _Func, class _Proj>
@@ -36,12 +40,13 @@ template <class _SegmentedIterator,
           class _Func,
           class _Proj,
           __enable_if_t<__is_segmented_iterator<_SegmentedIterator>::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Function& __func, _Proj& __proj) {
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+__for_each(_SegmentedIterator __first, _SegmentedIterator __last, _Func& __func, _Proj& __proj) {
   using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
   std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
     std::__for_each(__lfirst, __llast, __func, __proj);
   });
+  return __last;
 }
 #endif // !_LIBCPP_CXX03_LANG
 
@@ -55,4 +60,6 @@ for_each(_InputIterator __first, _InputIterator __last, _Func __f) {
 
 _LIBCPP_END_NAMESPACE_STD
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index fb0f14fae49ae..9a6c6bb5175d6 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -15,12 +15,11 @@
 #include <__config>
 #include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
-#include <__iterator/next.h>
 #include <__iterator/segmented_iterator.h>
 #include <__type_traits/disjunction.h>
 #include <__type_traits/enable_if.h>
-#include <__type_traits/negation.h>
 #include <__type_traits/invoke.h>
+#include <__type_traits/negation.h>
 #include <__utility/convert_to_integral.h>
 #include <__utility/move.h>
 
@@ -59,11 +58,11 @@ template <class _RandIter,
           class _Proj,
           __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
-__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) {
+__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f, _Proj& __proj) {
   typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n;
   auto __last                                                   = __first + __n;
   std::__for_each(__first, __last, __f, __proj);
-  return std::move(__last);
+  return __last;
 }
 
 #ifndef _LIBCPP_CXX03_LANG
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
index 6c257dbcdc3ea..1b522fb373eee 100644
--- a/libcxx/include/__algorithm/for_each_n_segment.h
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -10,13 +10,7 @@
 #define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
 
 #include <__config>
-<<<<<<< HEAD
 #include <__iterator/iterator_traits.h>
-=======
-#include <__iterator/distance.h>
-#include <__iterator/iterator_traits.h>
-#include <__iterator/next.h>
->>>>>>> 4a86118918e8 (Fix review comments)
 #include <__iterator/segmented_iterator.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
index 3ace25a6052b6..e6624bd304447 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -51,44 +51,9 @@ int main(int argc, char** argv) {
     bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
     bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
     bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
-  }
-
-  // std::for_each_n for join_view
-  {
-    auto bm = []<class Container>(std::string name, auto for_each_n) {
-      using C1       = typename Container::value_type;
-      using ElemType = typename C1::value_type;
-      benchmark::RegisterBenchmark(
-          name,
-          [for_each_n](auto& st) {
-            std::size_t const size     = st.range(0);
-            std::size_t const seg_size = 256;
-            std::size_t const segments = (size + seg_size - 1) / seg_size;
-            Container c(segments);
-            for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) {
-              c[i].resize(std::min(seg_size, n), ElemType(1));
-            }
-
-            auto view  = c | std::views::join;
-            auto first = view.begin();
-
-            for ([[maybe_unused]] auto _ : st) {
-              benchmark::DoNotOptimize(c);
-              auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); });
-              benchmark::DoNotOptimize(result);
-            }
-          })
-          ->Arg(8)
-          ->Arg(32)
-          ->Arg(50) // non power-of-two
-          ->Arg(1024)
-          ->Arg(4096)
-          ->Arg(8192)
-          ->Arg(1 << 14)
-          ->Arg(1 << 16)
-          ->Arg(1 << 18);
-    };
-    bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+    bm.operator()<std::vector<int>>("rng::for_each_n(vector<int>)", std::ranges::for_each_n);
+    bm.operator()<std::deque<int>>("rng::for_each_n(deque<int>)", std::ranges::for_each_n);
+    bm.operator()<std::list<int>>("rng::for_each_n(list<int>)", std::ranges::for_each_n);
   }
 
   // {std,ranges}::for_each_n for join_view

>From df7ac697c04c1b5fb97932ba86c54f3375c0f053 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Sat, 7 Jun 2025 07:56:55 -0400
Subject: [PATCH 12/12] Address ldionne's comments

---
 libcxx/docs/ReleaseNotes/21.rst              | 11 ++++-------
 libcxx/include/__algorithm/ranges_for_each.h |  5 ++++-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 9f1a32a222f0d..dc787e3486c81 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -68,18 +68,15 @@ Improvements and New Features
   up to 10x, depending on type of sorted elements and the initial state of the sorted array.
 
 - The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
-  in C++23 and later.
+  in C++23 and later. 
 
-- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
-  up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
+- The ``std::for_each_n``, ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for
+  segmented iterators, resulting in a performance improvement of up to 17.7x for ``std::deque<short>`` iterators, and up
+  to 13.9x for ``std::join_view<vector<vector<short>>>`` iterators.
 
 - The ``bitset::to_string`` function has been optimized, resulting in a performance improvement of up to 8.3x for bitsets
   with uniformly distributed zeros and ones, and up to 13.5x and 16.1x for sparse and dense bitsets, respectively.
 
-- The ``std::ranges::for_each`` and ``std::ranges::for_each_n`` algorithms have been optimized for segmented iterators,
-  resulting in performance improvements of up to 21.3x for ``std::deque::iterator`` and 24.9x for ``join_view`` of
-  ``vector<vector<char>>``.
-
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/include/__algorithm/ranges_for_each.h b/libcxx/include/__algorithm/ranges_for_each.h
index 1b11b52798dd6..e9c84e8583f87 100644
--- a/libcxx/include/__algorithm/ranges_for_each.h
+++ b/libcxx/include/__algorithm/ranges_for_each.h
@@ -43,7 +43,10 @@ struct __for_each {
   template <class _Iter, class _Sent, class _Proj, class _Func>
   _LIBCPP_HIDE_FROM_ABI constexpr static for_each_result<_Iter, _Func>
   __for_each_impl(_Iter __first, _Sent __last, _Func& __func, _Proj& __proj) {
-    if constexpr (!std::assignable_from<_Iter&, _Sent> && sized_sentinel_for<_Sent, _Iter>) {
+    // In the case where we have different iterator and sentinel types, the segmented iterator optimization
+    // in std::for_each will not kick in. Therefore, we prefer std::for_each_n in that case (whenever we can
+    // obtain the `n`).
+    if constexpr (!std::assignable_from<_Iter&, _Sent> && std::sized_sentinel_for<_Sent, _Iter>) {
       auto __n   = __last - __first;
       auto __end = std::__for_each_n(std::move(__first), __n, __func, __proj);
       return {std::move(__end), std::move(__func)};



More information about the libcxx-commits mailing list