[libcxx-commits] [libcxx] [libc++][PSTL] Introduce cpu traits (PR #88134)

via libcxx-commits libcxx-commits at lists.llvm.org
Tue Apr 9 07:44:44 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-libcxx

Author: Louis Dionne (ldionne)


Currently, CPU backends in the PSTL are created by defining functions
in the __par_backend namespace. The PSTL then includes whichever CPU
backend was configured via CMake to pick up those definitions.

This prevents CPU backends from easily co-existing and is a bit confusing.
To solve this problem, this patch introduces the notion of __cpu_traits,
which is a cheap encapsulation of the basic operations required to
implement a CPU-based PSTL. Different backends can now define their own
tag and coexist, and the CPU-based PSTL simply uses __cpu_traits to
dispatch to the right implementation of e.g. __for_each.
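
As a rough illustration only (a minimal sketch, not the patch itself: the real
interface lives in __pstl/cpu_algos/cpu_traits.h, uses reserved __ugly names,
and reports failure through a disengaged optional, as shown in the diff below),
the dispatch pattern looks like this:

```cpp
// Each backend defines its own tag type.
struct serial_backend_tag {};

// Primary template is left undefined; each backend specializes it.
template <class Backend>
struct cpu_traits;

template <>
struct cpu_traits<serial_backend_tag> {
  // Run func over [first, last) as a single chunk; a parallel backend would
  // instead split the range into chunks and run them concurrently.
  template <class RandomAccessIterator, class Functor>
  static void parallel_for(RandomAccessIterator first, RandomAccessIterator last, Functor func) {
    func(first, last);
  }
};

// The CPU-based PSTL layer is written once, against the traits.
template <class Backend, class Iterator, class Function>
void pstl_for_each(Iterator first, Iterator last, Function f) {
  cpu_traits<Backend>::parallel_for(first, last, [&](Iterator chunk_first, Iterator chunk_last) {
    for (; chunk_first != chunk_last; ++chunk_first)
      f(*chunk_first);
  });
}
```

Adding a new backend then only requires defining a new tag and specializing
the traits for it; the algorithm layer itself does not change.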

Note that this patch doesn't change the actual implementation of the
backends in any way; it only modifies how that implementation is accessed
when implementing PSTL algorithms.

This patch is a step towards #88131.

---

Patch is 55.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/88134.diff


18 Files Affected:

- (modified) libcxx/include/CMakeLists.txt (+1) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backend.h (-45) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h (+5-4) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h (+7-3) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h (+2-1) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h (+12-10) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h (+2-1) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h (+231-226) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h (+2-1) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h (+50-48) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h (+2-1) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h (+50-46) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h (+3-2) 
- (modified) libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h (+11-7) 
- (added) libcxx/include/__pstl/cpu_algos/cpu_traits.h (+84) 
- (modified) libcxx/include/module.modulemap (+2) 
- (modified) libcxx/src/pstl/libdispatch.cpp (+2-5) 
- (modified) libcxx/utils/generate_iwyu_mapping.py (+1) 


``````````diff
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 097a41d4c41740..1f90dd6db5b158 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -591,6 +591,7 @@ set(files
   __numeric/transform_exclusive_scan.h
   __numeric/transform_inclusive_scan.h
   __numeric/transform_reduce.h
+  __pstl/cpu_algos/cpu_traits.h
   __random/bernoulli_distribution.h
   __random/binomial_distribution.h
   __random/cauchy_distribution.h
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 6980ded189ea2a..c93139243af459 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -10,51 +10,6 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
 
 #include <__config>
-
-/*
-
-  // _Functor takes a subrange for [__first, __last) that should be executed in serial
-  template <class _RandomAccessIterator, class _Functor>
-  optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
-
-  template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
-  optional<_Tp>
-  __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
-
-  // Cancel the execution of other jobs - they aren't needed anymore
-  void __cancel_execution();
-
-  template <class _RandomAccessIterator1,
-            class _RandomAccessIterator2,
-            class _RandomAccessIterator3,
-            class _Compare,
-            class _LeafMerge>
-  optional<void> __parallel_merge(
-      _RandomAccessIterator1 __first1,
-      _RandomAccessIterator1 __last1,
-      _RandomAccessIterator2 __first2,
-      _RandomAccessIterator2 __last2,
-      _RandomAccessIterator3 __outit,
-      _Compare __comp,
-      _LeafMerge __leaf_merge);
-
-  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
-  void __parallel_stable_sort(_RandomAccessIterator __first,
-                              _RandomAccessIterator __last,
-                              _Comp __comp,
-                              _LeafSort __leaf_sort);
-
-  TODO: Document the parallel backend
-
-Exception handling
-==================
-
-CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
-implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned
-into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the
-frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
-*/
-
 #include <__algorithm/pstl_backends/cpu_backends/any_of.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/fill.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
index 13dff80086e72b..be5e54f3fa5c85 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
@@ -17,6 +17,7 @@
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
@@ -30,13 +31,13 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Index, class _Brick>
+template <class _Backend, class _Index, class _Brick>
 _LIBCPP_HIDE_FROM_ABI optional<bool> __parallel_or(_Index __first, _Index __last, _Brick __f) {
   std::atomic<bool> __found(false);
-  auto __ret = __par_backend::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
+  auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
     if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) {
       __found.store(true, std::memory_order_relaxed);
-      __par_backend::__cancel_execution();
+      __pstl::__cpu_traits<_Backend>::__cancel_execution();
     }
   });
   if (!__ret)
@@ -74,7 +75,7 @@ _LIBCPP_HIDE_FROM_ABI optional<bool>
 __pstl_any_of(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__parallel_or(
+    return std::__parallel_or<__cpu_backend_tag>(
         __first, __last, [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
           auto __res = std::__pstl_any_of<__remove_parallel_policy_t<_ExecutionPolicy>>(
               __cpu_backend_tag{}, __brick_first, __brick_last, __pred);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index ea2210a4a7adbd..0641a51e6823e3 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -30,9 +30,13 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-struct __cpu_backend_tag {};
-
-inline constexpr size_t __lane_size = 64;
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
+using __cpu_backend_tag = __pstl::__serial_backend_tag;
+#  elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
+using __cpu_backend_tag = __pstl::__std_thread_backend_tag;
+#  elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+using __cpu_backend_tag = __pstl::__libdispatch_backend_tag;
+#  endif
 
 _LIBCPP_END_NAMESPACE_STD
 
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 64babe9fd2bdae..49a32f6c5ce551 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return __par_backend::__parallel_for(
+    return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
         __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
           [[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
               __cpu_backend_tag{}, __brick_first, __brick_last, __value);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
index 170470e4fb7edd..11a5668bf25af1 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
@@ -16,6 +16,7 @@
 #include <__functional/operations.h>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
@@ -33,7 +34,7 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Index, class _Brick, class _Compare>
+template <class _Backend, class _Index, class _Brick, class _Compare>
 _LIBCPP_HIDE_FROM_ABI optional<_Index>
 __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool __b_first) {
   typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType;
@@ -41,8 +42,8 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
   _DifferenceType __initial_dist = __b_first ? __n : -1;
   std::atomic<_DifferenceType> __extremum(__initial_dist);
   // TODO: find out what is better here: parallel_for or parallel_reduce
-  auto __res =
-      __par_backend::__parallel_for(__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
+  auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for(
+      __first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
         // See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of
         // why using a shared variable scales fairly well in this situation.
         if (__comp(__i - __first, __extremum)) {
@@ -61,12 +62,12 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
   return __extremum.load() != __initial_dist ? __first + __extremum.load() : __last;
 }
 
-template <class _Index, class _DifferenceType, class _Compare>
+template <class _Backend, class _Index, class _DifferenceType, class _Compare>
 _LIBCPP_HIDE_FROM_ABI _Index
 __simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept {
   // Experiments show good block sizes like this
-  const _DifferenceType __block_size                        = 8;
-  alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
+  const _DifferenceType __block_size                                                        = 8;
+  alignas(__pstl::__cpu_traits<_Backend>::__lane_size) _DifferenceType __lane[__block_size] = {0};
   while (__end - __begin >= __block_size) {
     _DifferenceType __found = 0;
     _PSTL_PRAGMA_SIMD_REDUCTION(| : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size; ++__i) {
@@ -102,7 +103,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__parallel_find(
+    return std::__parallel_find<__cpu_backend_tag>(
         __first,
         __last,
         [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -116,9 +117,10 @@ __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __l
   } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                        __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     using __diff_t = __iter_diff_t<_ForwardIterator>;
-    return std::__simd_first(__first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
-      return __pred(__iter[__i]);
-    });
+    return std::__simd_first<__cpu_backend_tag>(
+        __first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
+          return __pred(__iter[__i]);
+        });
   } else {
     return std::find_if(__first, __last, __pred);
   }
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index 81fd4526b8dbf1..1667ec0f0c4f41 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for(
+    return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
         __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
           [[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
               __cpu_backend_tag{}, __brick_first, __brick_last, __func);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
index e885e7f225172c..8757f249680375 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
@@ -23,6 +23,7 @@
 #include <__memory/construct_at.h>
 #include <__memory/unique_ptr.h>
 #include <__numeric/reduce.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__utility/empty.h>
 #include <__utility/exception_guard.h>
 #include <__utility/move.h>
@@ -37,10 +38,11 @@ _LIBCPP_PUSH_MACROS
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
 
-namespace __par_backend {
-inline namespace __libdispatch {
+struct __libdispatch_backend_tag {};
 
+namespace __libdispatch {
 // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
 // we.
 // TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
@@ -77,267 +79,270 @@ __dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator _
 
   return __empty{};
 }
+} // namespace __libdispatch
 
-template <class _RandomAccessIterator, class _Functor>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
-  return __libdispatch::__dispatch_parallel_for(
-      __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
-}
-
-template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
-struct __merge_range {
-  __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
-      : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}
+template <>
+struct __cpu_traits<__libdispatch_backend_tag> {
+  template <class _RandomAccessIterator, class _Functor>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+  __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
+    return __libdispatch::__dispatch_parallel_for(
+        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
+  }
 
-  _RandomAccessIterator1 __mid1_;
-  _RandomAccessIterator2 __mid2_;
-  _RandomAccessIteratorOut __result_;
-};
+  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
+  struct __merge_range {
+    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
+        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}
 
-template <typename _RandomAccessIterator1,
-          typename _RandomAccessIterator2,
-          typename _RandomAccessIterator3,
-          typename _Compare,
-          typename _LeafMerge>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
-    _RandomAccessIterator1 __first1,
-    _RandomAccessIterator1 __last1,
-    _RandomAccessIterator2 __first2,
-    _RandomAccessIterator2 __last2,
-    _RandomAccessIterator3 __result,
-    _Compare __comp,
-    _LeafMerge __leaf_merge) noexcept {
-  __chunk_partitions __partitions =
-      __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));
-
-  if (__partitions.__chunk_count_ == 0)
-    return __empty{};
+    _RandomAccessIterator1 __mid1_;
+    _RandomAccessIterator2 __mid2_;
+    _RandomAccessIteratorOut __result_;
+  };
 
-  if (__partitions.__chunk_count_ == 1) {
-    __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
-    return __empty{};
-  }
+  template <typename _RandomAccessIterator1,
+            typename _RandomAccessIterator2,
+            typename _RandomAccessIterator3,
+            typename _Compare,
+            typename _LeafMerge>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge(
+      _RandomAccessIterator1 __first1,
+      _RandomAccessIterator1 __last1,
+      _RandomAccessIterator2 __first2,
+      _RandomAccessIterator2 __last2,
+      _RandomAccessIterator3 __result,
+      _Compare __comp,
+      _LeafMerge __leaf_merge) noexcept {
+    __libdispatch::__chunk_partitions __partitions =
+        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));
+
+    if (__partitions.__chunk_count_ == 0)
+      return __empty{};
+
+    if (__partitions.__chunk_count_ == 1) {
+      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
+      return __empty{};
+    }
 
-  using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
-  auto const __n_ranges = __partitions.__chunk_count_ + 1;
+    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
+    auto const __n_ranges = __partitions.__chunk_count_ + 1;
 
-  // TODO: use __uninitialized_buffer
-  auto __destroy = [=](__merge_range_t* __ptr) {
-    std::destroy_n(__ptr, __n_ranges);
-    std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
-  };
+    // TODO: use __uninitialized_buffer
+    auto __destroy = [=](__merge_range_t* __ptr) {
+      std::destroy_n(__ptr, __n_ranges);
+      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
+    };
 
-  unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
-      [&]() -> __merge_range_t* {
+    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
+        [&]() -> __merge_range_t* {
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-        try {
+          try {
 #  endif
-          return std::allocator<__merge_range_t>().allocate(__n_ranges);
+            return std::allocator<__merge_range_t>().allocate(__n_ranges);
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-        } catch (const std::bad_alloc&) {
-          return nullptr;
-        }
+          } catch (const std::bad_alloc&) {
+            return nullptr;
+          }
 #  endif
-      }(),
-      __destroy);
-
-  if (!__ranges)
-    return nullopt;
+        }(),
+        __destroy);
+
+    if (!__ranges)
+      return nullopt;
+
+    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
+    __merge_range_t* __r = __ranges.get();
+    std::__construct_at(__r++, __first1, __first2, __result);
+
+    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;
+
+    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
+      auto [__mid1, __mid2] = [&] {
+        if (__iterate_first_range) {
+          auto __m1 = __first1 + __chunk_size;
+          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
+          return std::make_pair(__m1, __m2);
+        } else {
+          auto __m2 = __first2 + __chunk_size;
+          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
+          return std::make_pair(__m1, __m2);
+        }
+      }();
 
-  // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
-  __merge_range_t* __r = __ranges.get();
-  std::__construct_at(__r++, __first1, __first2, __result);
+      __result += (__mid1 - __first1) + (__mid2 - __first2);
+      __first1 = __mid1;
+      __first2 = __mid2;
+      return {std::move(__mid1), std::move(__mid2), __result};
+    };
 
-  bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;
+    // handle first chunk
+    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));
 
-  auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
-    auto [__mid1, __mid2] =...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/88134


More information about the libcxx-commits mailing list