[libcxx-commits] [libcxx] 2b2e7f6 - [libc++][PSTL] Add a GCD backend

Wed Jul 12 13:27:38 PDT 2023

Author: Nikolas Klauser
Date: 2023-07-12T13:27:33-07:00
New Revision: 2b2e7f6e5727d523ca595ee85a6b1592c8609b4a

URL: https://github.com/llvm/llvm-project/commit/2b2e7f6e5727d523ca595ee85a6b1592c8609b4a
DIFF: https://github.com/llvm/llvm-project/commit/2b2e7f6e5727d523ca595ee85a6b1592c8609b4a.diff

LOG: [libc++][PSTL] Add a GCD backend

Reviewed By: ldionne, #libc

Spies: arichardson, mgrang, krytarowski, libcxx-commits, h-vetinari

Differential Revision: https://reviews.llvm.org/D151717

Added: 
    libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
    libcxx/src/pstl/libdispatch.cpp
    libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp

Modified: 
    libcxx/CMakeLists.txt
    libcxx/cmake/caches/Apple.cmake
    libcxx/include/CMakeLists.txt
    libcxx/include/__algorithm/pstl_backend.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
    libcxx/include/__config_site.in
    libcxx/include/__numeric/reduce.h
    libcxx/include/__utility/terminate_on_exception.h
    libcxx/include/module.modulemap.in
    libcxx/src/CMakeLists.txt
    libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp
    libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
    libcxx/utils/data/ignore_format.txt
    libcxx/utils/libcxx/test/features.py

Removed: 
    


################################################################################
diff  --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 8ae2a7afbe4768..2660e80f31d06a 100644

--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -797,9 +797,11 @@ if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "serial")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_SERIAL)
 elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "std_thread")
   config_define(1 _LIBCPP_PSTL_CPU_BACKEND_THREAD)
+elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
+  config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 else()
   message(FATAL_ERROR "LIBCXX_PSTL_CPU_BACKEND is set to ${LIBCXX_PSTL_CPU_BACKEND}, which is not a valid backend.
-                       Valid backends are: serial, std_thread")
+                       Valid backends are: serial, std_thread and libdispatch")
 endif()
 
 if (LIBCXX_ABI_DEFINES)

diff  --git a/libcxx/cmake/caches/Apple.cmake b/libcxx/cmake/caches/Apple.cmake
index c9ee3b075bf775..099b5387478f53 100644
--- a/libcxx/cmake/caches/Apple.cmake
+++ b/libcxx/cmake/caches/Apple.cmake
@@ -8,6 +8,7 @@ set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "")
 set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "")
 set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "")
 set(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS ON CACHE BOOL "")
+set(LIBCXX_PSTL_CPU_BACKEND libdispatch)
 
 set(LIBCXX_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")
 set(LIBCXXABI_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "")

diff  --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 29f86d96f678f7..12cb12259e110a 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -78,6 +78,7 @@ set(files
   __algorithm/pstl_backends/cpu_backends/fill.h
   __algorithm/pstl_backends/cpu_backends/find_if.h
   __algorithm/pstl_backends/cpu_backends/for_each.h
+  __algorithm/pstl_backends/cpu_backends/libdispatch.h
   __algorithm/pstl_backends/cpu_backends/merge.h
   __algorithm/pstl_backends/cpu_backends/serial.h
   __algorithm/pstl_backends/cpu_backends/stable_sort.h

diff  --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 368235931fcb88..4d0213555fd8ad 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -169,7 +169,8 @@ struct __select_backend<std::execution::unsequenced_policy> {
 };
 #  endif
 
-#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) ||                          \
+      defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
 template <>
 struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index fa353a69869f94..ea2210a4a7adbd 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -16,6 +16,8 @@
 #  include <__algorithm/pstl_backends/cpu_backends/serial.h>
 #elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
 #  include <__algorithm/pstl_backends/cpu_backends/thread.h>
+#elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+#  include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
 #else
 #  error "Invalid CPU backend choice"
 #endif

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
new file mode 100644
index 00000000000000..49af1c4b1c0564
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
@@ -0,0 +1,226 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
+
+#include <__algorithm/lower_bound.h>
+#include <__algorithm/upper_bound.h>
+#include <__atomic/atomic.h>
+#include <__config>
+#include <__exception/terminate.h>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/move_iterator.h>
+#include <__memory/construct_at.h>
+#include <__memory/unique_ptr.h>
+#include <__memory_resource/memory_resource.h>
+#include <__numeric/reduce.h>
+#include <__utility/exception_guard.h>
+#include <__utility/move.h>
+#include <__utility/terminate_on_exception.h>
+#include <cstddef>
+#include <new>
+#include <vector>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend {
+inline namespace __libdispatch {
+
+// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
+// we.
+// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
+_LIBCPP_EXPORTED_FROM_ABI void
+__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;
+
+template <class _Func>
+_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
+  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
+    (*static_cast<_Func*>(__context))(__chunk);
+  });
+}
+
+struct __chunk_partitions {
+  ptrdiff_t __chunk_count_; // includes the first chunk
+  ptrdiff_t __chunk_size_;
+  ptrdiff_t __first_chunk_size_;
+};
+
+[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI pmr::memory_resource* __get_memory_resource();
+[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size);
+
+template <class _RandomAccessIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
+  auto __partitions = __libdispatch::__partition_chunks(__last - __first);
+
+  // Perform the chunked execution.
+  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
+    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
+    auto __index =
+        __chunk == 0
+            ? 0
+            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
+    __func(__first + __index, __first + __index + __this_chunk_size);
+  });
+}
+
+template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
+struct __merge_range {
+  __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
+      : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}
+
+  _RandomAccessIterator1 __mid1_;
+  _RandomAccessIterator2 __mid2_;
+  _RandomAccessIteratorOut __result_;
+};
+
+template <typename _RandomAccessIterator1,
+          typename _RandomAccessIterator2,
+          typename _RandomAccessIterator3,
+          typename _Compare,
+          typename _LeafMerge>
+_LIBCPP_HIDE_FROM_ABI void __parallel_merge(
+    _RandomAccessIterator1 __first1,
+    _RandomAccessIterator1 __last1,
+    _RandomAccessIterator2 __first2,
+    _RandomAccessIterator2 __last2,
+    _RandomAccessIterator3 __result,
+    _Compare __comp,
+    _LeafMerge __leaf_merge) {
+  __chunk_partitions __partitions =
+      __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));
+
+  if (__partitions.__chunk_count_ == 0)
+    return;
+
+  if (__partitions.__chunk_count_ == 1) {
+    __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
+    return;
+  }
+
+  using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
+
+  vector<__merge_range_t> __ranges;
+  __ranges.reserve(__partitions.__chunk_count_ + 1);
+
+  // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
+  std::__terminate_on_exception([&] {
+    __ranges.emplace_back(__first1, __first2, __result);
+
+    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;
+
+    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
+      auto [__mid1, __mid2] = [&] {
+        if (__iterate_first_range) {
+          auto __m1 = __first1 + __chunk_size;
+          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
+          return std::make_pair(__m1, __m2);
+        } else {
+          auto __m2 = __first2 + __chunk_size;
+          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
+          return std::make_pair(__m1, __m2);
+        }
+      }();
+
+      __result += (__mid1 - __first1) + (__mid2 - __first2);
+      __first1 = __mid1;
+      __first2 = __mid2;
+      return {std::move(__mid1), std::move(__mid2), __result};
+    };
+
+    // handle first chunk
+    __ranges.emplace_back(__compute_chunk(__partitions.__first_chunk_size_));
+
+    // handle 2 -> N - 1 chunks
+    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
+      __ranges.emplace_back(__compute_chunk(__partitions.__chunk_size_));
+
+    // handle last chunk
+    __ranges.emplace_back(__last1, __last2, __result);
+
+    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
+      auto __first_iters = __ranges[__index];
+      auto __last_iters  = __ranges[__index + 1];
+      __leaf_merge(
+          __first_iters.__mid1_,
+          __last_iters.__mid1_,
+          __first_iters.__mid2_,
+          __last_iters.__mid2_,
+          __first_iters.__result_,
+          __comp);
+    });
+  });
+}
+
+template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
+_LIBCPP_HIDE_FROM_ABI _Value __parallel_transform_reduce(
+    _RandomAccessIterator __first,
+    _RandomAccessIterator __last,
+    _Transform __transform,
+    _Value __init,
+    _Combiner __combiner,
+    _Reduction __reduction) {
+  auto __partitions = __libdispatch::__partition_chunks(__last - __first);
+
+  auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
+    std::destroy_n(__ptr, __count);
+    std::allocator<_Value>().deallocate(__ptr, __count);
+  };
+
+  // TODO: use __uninitialized_buffer
+  // TODO: allocate one element per worker instead of one element per chunk
+  unique_ptr<_Value[], decltype(__destroy)> __values(
+      std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
+
+  // __dispatch_apply is noexcept
+  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
+    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
+    auto __index =
+        __chunk == 0
+            ? 0
+            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
+    if (__this_chunk_size != 1) {
+      std::__construct_at(
+          __values.get() + __chunk,
+          __reduction(__first + __index + 2,
+                      __first + __index + __this_chunk_size,
+                      __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
+    } else {
+      std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
+    }
+  });
+
+  return std::__terminate_on_exception([&] {
+    return std::reduce(
+        std::make_move_iterator(__values.get()),
+        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
+        std::move(__init),
+        __combiner);
+  });
+}
+
+// TODO: parallelize this
+template <class _RandomAccessIterator, class _Comp, class _LeafSort>
+_LIBCPP_HIDE_FROM_ABI void __parallel_stable_sort(
+    _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
+  __leaf_sort(__first, __last, __comp);
+}
+
+_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
+
+} // namespace __libdispatch
+} // namespace __par_backend
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index 09e4ffce9e2927..c51c312d938a95 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -164,7 +164,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp __pstl_transform_reduce(
           [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
           std::move(__init),
           __reduce,
-          [=](_ForwardIterator __brick_first, _ForwardIterator __brick_last, _Tp __brick_init) {
+          [__transform, __reduce](auto __brick_first, auto __brick_last, _Tp __brick_init) {
             return std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
                 __cpu_backend_tag{},
                 std::move(__brick_first),

diff  --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index e006f758098f82..0d500718b801db 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -33,6 +33,7 @@
 // PSTL backends
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
+#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

diff  --git a/libcxx/include/__numeric/reduce.h b/libcxx/include/__numeric/reduce.h
index 9e710fc993f4f2..8daa7cf60e25f7 100644
--- a/libcxx/include/__numeric/reduce.h
+++ b/libcxx/include/__numeric/reduce.h
@@ -13,6 +13,7 @@
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/iterator_traits.h>
+#include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -25,7 +26,7 @@ template <class _InputIterator, class _Tp, class _BinaryOp>
 _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp reduce(_InputIterator __first, _InputIterator __last,
                                                                    _Tp __init, _BinaryOp __b) {
   for (; __first != __last; ++__first)
-    __init = __b(__init, *__first);
+    __init = __b(std::move(__init), *__first);
   return __init;
 }
 

diff  --git a/libcxx/include/__utility/terminate_on_exception.h b/libcxx/include/__utility/terminate_on_exception.h
index b539db6be978db..e035ec3409ae52 100644
--- a/libcxx/include/__utility/terminate_on_exception.h
+++ b/libcxx/include/__utility/terminate_on_exception.h
@@ -11,6 +11,7 @@
 
 #include <__config>
 #include <__exception/terminate.h>
+#include <new>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header

diff  --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 1992f36884e8cc..395e6fa318268e 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -344,6 +344,9 @@ module std [system] {
       module pstl_backends_cpu_backends_for_each {
         private header "__algorithm/pstl_backends/cpu_backends/for_each.h"
       }
+      module pstl_backends_cpu_backends_libdispatch {
+        private header "__algorithm/pstl_backends/cpu_backends/libdispatch.h"
+      }
       module pstl_backends_cpu_backends_merge {
         private header "__algorithm/pstl_backends/cpu_backends/merge.h"
       }

diff  --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 623d9190544390..a3f8f4dc3a3cd1 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -318,6 +318,10 @@ set(LIBCXX_EXPERIMENTAL_SOURCES
   experimental/memory_resource.cpp
   )
 
+if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch")
+  list(APPEND LIBCXX_EXPERIMENTAL_SOURCES pstl/libdispatch.cpp)
+endif()
+
 add_library(cxx_experimental STATIC ${LIBCXX_EXPERIMENTAL_SOURCES})
 target_link_libraries(cxx_experimental PUBLIC cxx-headers)
 if (LIBCXX_ENABLE_SHARED)

diff  --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp
new file mode 100644
index 00000000000000..e264aade0298d5
--- /dev/null
+++ b/libcxx/src/pstl/libdispatch.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <__algorithm/min.h>
+#include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
+#include <__config>
+#include <dispatch/dispatch.h>
+#include <memory_resource>
+#include <thread>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend::inline __libdispatch {
+
+pmr::memory_resource* __get_memory_resource() {
+  static std::pmr::synchronized_pool_resource pool{pmr::new_delete_resource()};
+  return &pool;
+}
+
+void __dispatch_apply(size_t chunk_count, void* context, void (*func)(void* context, size_t chunk)) noexcept {
+  ::dispatch_apply_f(chunk_count, DISPATCH_APPLY_AUTO, context, func);
+}
+
+__chunk_partitions __partition_chunks(ptrdiff_t element_count) {
+  __chunk_partitions partitions;
+  partitions.__chunk_count_ = [&] {
+    ptrdiff_t cores = std::max(1u, thread::hardware_concurrency());
+
+    auto medium = [&](ptrdiff_t n) { return cores + ((n - cores) / cores); };
+
+    // This is an approximation of `log(1.01, sqrt(n))` which seemes to be reasonable for `n` larger than 500 and tops
+    // at 800 tasks for n ~ 8 million
+    auto large = [](ptrdiff_t n) { return static_cast<ptrdiff_t>(100.499 * std::log(std::sqrt(n))); };
+
+    if (element_count < cores)
+      return element_count;
+    else if (element_count < 500)
+      return medium(element_count);
+    else
+      return std::min(medium(element_count), large(element_count)); // provide a "smooth" transition
+  }();
+  partitions.__chunk_size_       = element_count / partitions.__chunk_count_;
+  partitions.__first_chunk_size_ = partitions.__chunk_size_;
+
+  const ptrdiff_t leftover_item_count = element_count - (partitions.__chunk_count_ * partitions.__chunk_size_);
+
+  if (leftover_item_count == 0)
+    return partitions;
+
+  if (leftover_item_count == partitions.__chunk_size_) {
+    partitions.__chunk_count_ += 1;
+    return partitions;
+  }
+
+  const ptrdiff_t n_extra_items_per_chunk = leftover_item_count / partitions.__chunk_count_;
+  const ptrdiff_t n_final_leftover_items  = leftover_item_count - (n_extra_items_per_chunk * partitions.__chunk_count_);
+
+  partitions.__chunk_size_ += n_extra_items_per_chunk;
+  partitions.__first_chunk_size_ = partitions.__chunk_size_ + n_final_leftover_items;
+  return partitions;
+}
+
+// NOLINTNEXTLINE(llvm-namespace-comment) // This is https://llvm.org/PR56804
+} // namespace __par_backend::inline __libdispatch
+
+_LIBCPP_END_NAMESPACE_STD

diff  --git a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
new file mode 100644
index 00000000000000..5c4fa6bfae4540
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <algorithm>
+
+// REQUIRES: libcpp-pstl-cpu-backend-libdispatch
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-private-header
+
+// __chunk_partitions __partition_chunks(ptrdiff_t);
+
+#include <__algorithm/pstl_backends/cpu_backends/libdispatch.h>
+#include <cassert>
+#include <cstddef>
+
+int main(int, char**) {
+  for (std::ptrdiff_t i = 0; i != 2ll << 20; ++i) {
+    auto chunks = std::__par_backend::__libdispatch::__partition_chunks(i);
+    assert(chunks.__chunk_count_ <= i);
+    assert((chunks.__chunk_count_ - 1) * chunks.__chunk_size_ + chunks.__first_chunk_size_ == i);
+  }
+  return 0;
+}

diff  --git a/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp
index 0667f6f8de4932..1feadfb377a686 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp
@@ -49,6 +49,13 @@ struct Test {
       assert((out == std::array{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}));
     }
 
+    { // check that it works with both ranges being empty
+      std::array<int, 0> a;
+      std::array<int, 0> b;
+      std::array<int, std::size(a) + std::size(b)> out;
+      std::merge(
+          policy, Iter1(std::begin(a)), Iter1(std::end(a)), Iter2(std::begin(b)), Iter2(std::end(b)), std::begin(out));
+    }
     { // check that it works with the first range being empty
       std::array<int, 0> a;
       int b[] = {2, 4, 6, 8, 10};

diff  --git a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp b/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
index 38eaf81f3fe08e..18b56f237c3e62 100644
--- a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
+++ b/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
@@ -38,17 +38,30 @@
 #include "test_macros.h"
 #include "type_algorithms.h"
 
+template <class T>
+struct constructible_from {
+  T v_;
+
+  explicit constructible_from(T v) : v_(v) {}
+
+  friend constructible_from operator+(constructible_from lhs, constructible_from rhs) {
+    return constructible_from{lhs.get() + rhs.get()};
+  }
+
+  T get() const { return v_; }
+};
+
 template <class Iter1, class Iter2, class ValueT>
 struct Test {
   template <class Policy>
   void operator()(Policy&& policy) {
-    for (const auto& pair : {std::pair{0, 34}, {1, 33}, {2, 30}, {100, 313434}, {350, 14046934}}) {
+    for (const auto& pair : {std::pair{0, 34}, {1, 40}, {2, 48}, {100, 10534}, {350, 124284}}) {
       auto [size, expected] = pair;
       std::vector<int> a(size);
       std::vector<int> b(size);
       for (int i = 0; i != size; ++i) {
         a[i] = i + 1;
-        b[i] = i - 4;
+        b[i] = i + 4;
       }
 
       decltype(auto) ret = std::transform_reduce(
@@ -57,8 +70,8 @@ struct Test {
           Iter1(std::data(a) + std::size(a)),
           Iter2(std::data(b)),
           ValueT(34),
-          [](ValueT i, ValueT j) { return i + j + 3; },
-          [](ValueT i, ValueT j) { return i * j; });
+          std::plus{},
+          [](ValueT i, ValueT j) { return i + j + 1; });
       static_assert(std::is_same_v<decltype(ret), ValueT>);
       assert(ret == expected);
     }
@@ -77,6 +90,20 @@ struct Test {
       static_assert(std::is_same_v<decltype(ret), int>);
       assert(ret == expected);
     }
+    {
+      int a[] = {1, 2, 3, 4, 5, 6, 7, 8};
+      int b[] = {8, 7, 6, 5, 4, 3, 2, 1};
+
+      auto ret = std::transform_reduce(
+          policy,
+          Iter1(std::begin(a)),
+          Iter1(std::end(a)),
+          Iter2(std::begin(b)),
+          constructible_from<int>{0},
+          std::plus{},
+          [](int i, int j) { return constructible_from<int>{i + j}; });
+      assert(ret.get() == 72);
+    }
   }
 };
 

diff  --git a/libcxx/utils/data/ignore_format.txt b/libcxx/utils/data/ignore_format.txt
index bcba849f7c30a6..1353bc99b74da3 100644
--- a/libcxx/utils/data/ignore_format.txt
+++ b/libcxx/utils/data/ignore_format.txt
@@ -553,6 +553,7 @@ libcxx/src/mutex.cpp
 libcxx/src/mutex_destructor.cpp
 libcxx/src/new.cpp
 libcxx/src/optional.cpp
+libcxx/src/pstl/libdispatch.cpp
 libcxx/src/random.cpp
 libcxx/src/random_shuffle.cpp
 libcxx/src/regex.cpp

diff  --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 54a1e2772e014d..9a4c20a0eec9e3 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -307,6 +307,7 @@ def _getSuitableClangTidy(cfg):
     "_LIBCPP_HAS_NO_LOCALIZATION": "no-localization",
     "_LIBCPP_HAS_NO_WIDE_CHARACTERS": "no-wide-characters",
     "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode",
+    "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH": "libcpp-pstl-cpu-backend-libdispatch",
 }
 for macro, feature in macros.items():
     DEFAULT_FEATURES.append(