[libcxx-commits] [libcxx] [libc++][PSTL] Introduce cpu traits (PR #88134)
Louis Dionne via libcxx-commits
libcxx-commits at lists.llvm.org
Fri Apr 12 14:00:37 PDT 2024
https://github.com/ldionne updated https://github.com/llvm/llvm-project/pull/88134
>From f4c092d929022cbc0b85194ff2d39b352a84ec45 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Fri, 5 Apr 2024 15:11:35 -0400
Subject: [PATCH 1/6] [libc++] Introduce cpu traits
Currently, CPU backends in the PSTL are created by defining functions
in the __par_backend namespace. The PSTL then includes whichever CPU backend
was configured via CMake and picks up those definitions.
This prevents CPU backends from easily coexisting and is a bit confusing.
To solve this problem, this patch introduces the notion of __cpu_traits,
which is a cheap encapsulation of the basic operations required to
implement a CPU-based PSTL. Different backends can now define their own
tag and coexist, and the CPU-based PSTL will simply use __cpu_traits to
dispatch to the right implementation of e.g. __for_each.
Note that this patch doesn't change the actual implementation of the
backends in any way; it only modifies how that implementation is accessed
to implement PSTL algorithms.
This patch is a step towards #88131.
---
libcxx/include/CMakeLists.txt | 1 +
.../__algorithm/pstl_backends/cpu_backend.h | 45 --
.../pstl_backends/cpu_backends/any_of.h | 9 +-
.../pstl_backends/cpu_backends/backend.h | 10 +-
.../pstl_backends/cpu_backends/fill.h | 3 +-
.../pstl_backends/cpu_backends/find_if.h | 22 +-
.../pstl_backends/cpu_backends/for_each.h | 3 +-
.../pstl_backends/cpu_backends/libdispatch.h | 457 +++++++++---------
.../pstl_backends/cpu_backends/merge.h | 3 +-
.../pstl_backends/cpu_backends/serial.h | 98 ++--
.../pstl_backends/cpu_backends/stable_sort.h | 3 +-
.../pstl_backends/cpu_backends/thread.h | 96 ++--
.../pstl_backends/cpu_backends/transform.h | 5 +-
.../cpu_backends/transform_reduce.h | 18 +-
libcxx/include/__pstl/cpu_algos/cpu_traits.h | 84 ++++
libcxx/include/module.modulemap | 2 +
libcxx/src/pstl/libdispatch.cpp | 7 +-
libcxx/utils/generate_iwyu_mapping.py | 1 +
18 files changed, 467 insertions(+), 400 deletions(-)
create mode 100644 libcxx/include/__pstl/cpu_algos/cpu_traits.h
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 097a41d4c41740..1f90dd6db5b158 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -591,6 +591,7 @@ set(files
__numeric/transform_exclusive_scan.h
__numeric/transform_inclusive_scan.h
__numeric/transform_reduce.h
+ __pstl/cpu_algos/cpu_traits.h
__random/bernoulli_distribution.h
__random/binomial_distribution.h
__random/cauchy_distribution.h
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 6980ded189ea2a..c93139243af459 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -10,51 +10,6 @@
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
#include <__config>
-
-/*
-
- // _Functor takes a subrange for [__first, __last) that should be executed in serial
- template <class _RandomAccessIterator, class _Functor>
- optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
-
- template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
- optional<_Tp>
- __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
-
- // Cancel the execution of other jobs - they aren't needed anymore
- void __cancel_execution();
-
- template <class _RandomAccessIterator1,
- class _RandomAccessIterator2,
- class _RandomAccessIterator3,
- class _Compare,
- class _LeafMerge>
- optional<void> __parallel_merge(
- _RandomAccessIterator1 __first1,
- _RandomAccessIterator1 __last1,
- _RandomAccessIterator2 __first2,
- _RandomAccessIterator2 __last2,
- _RandomAccessIterator3 __outit,
- _Compare __comp,
- _LeafMerge __leaf_merge);
-
- template <class _RandomAccessIterator, class _Comp, class _LeafSort>
- void __parallel_stable_sort(_RandomAccessIterator __first,
- _RandomAccessIterator __last,
- _Comp __comp,
- _LeafSort __leaf_sort);
-
- TODO: Document the parallel backend
-
-Exception handling
-==================
-
-CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
-implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned
-into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the
-frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
-*/
-
#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__algorithm/pstl_backends/cpu_backends/fill.h>
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
index 13dff80086e72b..3755d288047e0b 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
@@ -17,6 +17,7 @@
#include <__config>
#include <__functional/operations.h>
#include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/move.h>
#include <__utility/pair.h>
@@ -30,13 +31,13 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Index, class _Brick>
+template <class _Backend, class _Index, class _Brick>
_LIBCPP_HIDE_FROM_ABI optional<bool> __parallel_or(_Index __first, _Index __last, _Brick __f) {
std::atomic<bool> __found(false);
- auto __ret = __par_backend::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
+ auto __ret = __pstl::__cpu_traits<_Backend>::__for_each(__first, __last, [__f, &__found](_Index __i, _Index __j) {
if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) {
__found.store(true, std::memory_order_relaxed);
- __par_backend::__cancel_execution();
+ __pstl::__cpu_traits<_Backend>::__cancel_execution();
}
});
if (!__ret)
@@ -74,7 +75,7 @@ _LIBCPP_HIDE_FROM_ABI optional<bool>
__pstl_any_of(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return std::__parallel_or(
+ return std::__parallel_or<__cpu_backend_tag>(
__first, __last, [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
auto __res = std::__pstl_any_of<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __pred);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index ea2210a4a7adbd..0641a51e6823e3 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -30,9 +30,13 @@
_LIBCPP_BEGIN_NAMESPACE_STD
-struct __cpu_backend_tag {};
-
-inline constexpr size_t __lane_size = 64;
+# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
+using __cpu_backend_tag = __pstl::__serial_backend_tag;
+# elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
+using __cpu_backend_tag = __pstl::__std_thread_backend_tag;
+# elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+using __cpu_backend_tag = __pstl::__libdispatch_backend_tag;
+# endif
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 64babe9fd2bdae..0c20bdff62675a 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -13,6 +13,7 @@
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/empty.h>
#include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return __par_backend::__parallel_for(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
__first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
[[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __value);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
index 170470e4fb7edd..626293faef6921 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
@@ -16,6 +16,7 @@
#include <__functional/operations.h>
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/move.h>
#include <__utility/pair.h>
@@ -33,7 +34,7 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Index, class _Brick, class _Compare>
+template <class _Backend, class _Index, class _Brick, class _Compare>
_LIBCPP_HIDE_FROM_ABI optional<_Index>
__parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool __b_first) {
typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType;
@@ -41,8 +42,8 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
_DifferenceType __initial_dist = __b_first ? __n : -1;
std::atomic<_DifferenceType> __extremum(__initial_dist);
// TODO: find out what is better here: parallel_for or parallel_reduce
- auto __res =
- __par_backend::__parallel_for(__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
+ auto __res = __pstl::__cpu_traits<_Backend>::__for_each(
+ __first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
// See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of
// why using a shared variable scales fairly well in this situation.
if (__comp(__i - __first, __extremum)) {
@@ -61,12 +62,12 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
return __extremum.load() != __initial_dist ? __first + __extremum.load() : __last;
}
-template <class _Index, class _DifferenceType, class _Compare>
+template <class _Backend, class _Index, class _DifferenceType, class _Compare>
_LIBCPP_HIDE_FROM_ABI _Index
__simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept {
// Experiments show good block sizes like this
- const _DifferenceType __block_size = 8;
- alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
+ const _DifferenceType __block_size = 8;
+ alignas(__pstl::__cpu_traits<_Backend>::__lane_size) _DifferenceType __lane[__block_size] = {0};
while (__end - __begin >= __block_size) {
_DifferenceType __found = 0;
_PSTL_PRAGMA_SIMD_REDUCTION(| : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size; ++__i) {
@@ -102,7 +103,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
__pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return std::__parallel_find(
+ return std::__parallel_find<__cpu_backend_tag>(
__first,
__last,
[&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -116,9 +117,10 @@ __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __l
} else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
using __diff_t = __iter_diff_t<_ForwardIterator>;
- return std::__simd_first(__first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
- return __pred(__iter[__i]);
- });
+ return std::__simd_first<__cpu_backend_tag>(
+ __first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
+ return __pred(__iter[__i]);
+ });
} else {
return std::find_if(__first, __last, __pred);
}
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index 81fd4526b8dbf1..d637084e151d81 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -13,6 +13,7 @@
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/empty.h>
#include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return std::__par_backend::__parallel_for(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
__first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
[[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __func);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
index e885e7f225172c..17faadf55dd4fa 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
@@ -23,6 +23,7 @@
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__numeric/reduce.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
@@ -37,10 +38,11 @@ _LIBCPP_PUSH_MACROS
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
_LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
-namespace __par_backend {
-inline namespace __libdispatch {
+struct __libdispatch_backend_tag {};
+namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
@@ -77,267 +79,270 @@ __dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator _
return __empty{};
}
+} // namespace __libdispatch
-template <class _RandomAccessIterator, class _Functor>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
- return __libdispatch::__dispatch_parallel_for(
- __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
-}
-
-template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
-struct __merge_range {
- __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
- : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}
+template <>
+struct __cpu_traits<__libdispatch_backend_tag> {
+ template <class _RandomAccessIterator, class _Functor>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
+ return __libdispatch::__dispatch_parallel_for(
+ __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
+ }
- _RandomAccessIterator1 __mid1_;
- _RandomAccessIterator2 __mid2_;
- _RandomAccessIteratorOut __result_;
-};
+ template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
+ struct __merge_range {
+ __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
+ : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}
-template <typename _RandomAccessIterator1,
- typename _RandomAccessIterator2,
- typename _RandomAccessIterator3,
- typename _Compare,
- typename _LeafMerge>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
- _RandomAccessIterator1 __first1,
- _RandomAccessIterator1 __last1,
- _RandomAccessIterator2 __first2,
- _RandomAccessIterator2 __last2,
- _RandomAccessIterator3 __result,
- _Compare __comp,
- _LeafMerge __leaf_merge) noexcept {
- __chunk_partitions __partitions =
- __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));
-
- if (__partitions.__chunk_count_ == 0)
- return __empty{};
+ _RandomAccessIterator1 __mid1_;
+ _RandomAccessIterator2 __mid2_;
+ _RandomAccessIteratorOut __result_;
+ };
- if (__partitions.__chunk_count_ == 1) {
- __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
- return __empty{};
- }
+ template <typename _RandomAccessIterator1,
+ typename _RandomAccessIterator2,
+ typename _RandomAccessIterator3,
+ typename _Compare,
+ typename _LeafMerge>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __merge(_RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1,
+ _RandomAccessIterator2 __first2,
+ _RandomAccessIterator2 __last2,
+ _RandomAccessIterator3 __result,
+ _Compare __comp,
+ _LeafMerge __leaf_merge) noexcept {
+ __libdispatch::__chunk_partitions __partitions =
+ __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));
+
+ if (__partitions.__chunk_count_ == 0)
+ return __empty{};
+
+ if (__partitions.__chunk_count_ == 1) {
+ __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
+ return __empty{};
+ }
- using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
- auto const __n_ranges = __partitions.__chunk_count_ + 1;
+ using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
+ auto const __n_ranges = __partitions.__chunk_count_ + 1;
- // TODO: use __uninitialized_buffer
- auto __destroy = [=](__merge_range_t* __ptr) {
- std::destroy_n(__ptr, __n_ranges);
- std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
- };
+ // TODO: use __uninitialized_buffer
+ auto __destroy = [=](__merge_range_t* __ptr) {
+ std::destroy_n(__ptr, __n_ranges);
+ std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
+ };
- unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
- [&]() -> __merge_range_t* {
+ unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
+ [&]() -> __merge_range_t* {
# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
- try {
+ try {
# endif
- return std::allocator<__merge_range_t>().allocate(__n_ranges);
+ return std::allocator<__merge_range_t>().allocate(__n_ranges);
# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
- } catch (const std::bad_alloc&) {
- return nullptr;
- }
+ } catch (const std::bad_alloc&) {
+ return nullptr;
+ }
# endif
- }(),
- __destroy);
-
- if (!__ranges)
- return nullopt;
+ }(),
+ __destroy);
+
+ if (!__ranges)
+ return nullopt;
+
+ // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
+ __merge_range_t* __r = __ranges.get();
+ std::__construct_at(__r++, __first1, __first2, __result);
+
+ bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;
+
+ auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
+ auto [__mid1, __mid2] = [&] {
+ if (__iterate_first_range) {
+ auto __m1 = __first1 + __chunk_size;
+ auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
+ return std::make_pair(__m1, __m2);
+ } else {
+ auto __m2 = __first2 + __chunk_size;
+ auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
+ return std::make_pair(__m1, __m2);
+ }
+ }();
- // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
- __merge_range_t* __r = __ranges.get();
- std::__construct_at(__r++, __first1, __first2, __result);
+ __result += (__mid1 - __first1) + (__mid2 - __first2);
+ __first1 = __mid1;
+ __first2 = __mid2;
+ return {std::move(__mid1), std::move(__mid2), __result};
+ };
- bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;
+ // handle first chunk
+ std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));
- auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
- auto [__mid1, __mid2] = [&] {
- if (__iterate_first_range) {
- auto __m1 = __first1 + __chunk_size;
- auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
- return std::make_pair(__m1, __m2);
- } else {
- auto __m2 = __first2 + __chunk_size;
- auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
- return std::make_pair(__m1, __m2);
- }
- }();
+ // handle 2 -> N - 1 chunks
+ for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
+ std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));
- __result += (__mid1 - __first1) + (__mid2 - __first2);
- __first1 = __mid1;
- __first2 = __mid2;
- return {std::move(__mid1), std::move(__mid2), __result};
- };
+ // handle last chunk
+ std::__construct_at(__r, __last1, __last2, __result);
- // handle first chunk
- std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));
-
- // handle 2 -> N - 1 chunks
- for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
- std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));
-
- // handle last chunk
- std::__construct_at(__r, __last1, __last2, __result);
-
- __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
- auto __first_iters = __ranges[__index];
- auto __last_iters = __ranges[__index + 1];
- __leaf_merge(
- __first_iters.__mid1_,
- __last_iters.__mid1_,
- __first_iters.__mid2_,
- __last_iters.__mid2_,
- __first_iters.__result_,
- __comp);
- });
+ __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
+ auto __first_iters = __ranges[__index];
+ auto __last_iters = __ranges[__index + 1];
+ __leaf_merge(
+ __first_iters.__mid1_,
+ __last_iters.__mid1_,
+ __first_iters.__mid2_,
+ __last_iters.__mid2_,
+ __first_iters.__result_,
+ __comp);
+ });
- return __empty{};
-}
+ return __empty{};
+ }
-template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
-_LIBCPP_HIDE_FROM_ABI optional<_Value> __parallel_transform_reduce(
- _RandomAccessIterator __first,
- _RandomAccessIterator __last,
- _Transform __transform,
- _Value __init,
- _Combiner __combiner,
- _Reduction __reduction) {
- if (__first == __last)
- return __init;
-
- auto __partitions = __libdispatch::__partition_chunks(__last - __first);
-
- auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
- std::destroy_n(__ptr, __count);
- std::allocator<_Value>().deallocate(__ptr, __count);
- };
+ template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
+ _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
+ _RandomAccessIterator __first,
+ _RandomAccessIterator __last,
+ _Transform __transform,
+ _Value __init,
+ _Combiner __combiner,
+ _Reduction __reduction) {
+ if (__first == __last)
+ return __init;
+
+ auto __partitions = __libdispatch::__partition_chunks(__last - __first);
+
+ auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
+ std::destroy_n(__ptr, __count);
+ std::allocator<_Value>().deallocate(__ptr, __count);
+ };
- // TODO: use __uninitialized_buffer
- // TODO: allocate one element per worker instead of one element per chunk
- unique_ptr<_Value[], decltype(__destroy)> __values(
- std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
+ // TODO: use __uninitialized_buffer
+ // TODO: allocate one element per worker instead of one element per chunk
+ unique_ptr<_Value[], decltype(__destroy)> __values(
+ std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
+
+ // __dispatch_apply is noexcept
+ __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
+ auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
+ auto __index = __chunk == 0 ? 0
+ : (__chunk * __partitions.__chunk_size_) +
+ (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
+ if (__this_chunk_size != 1) {
+ std::__construct_at(
+ __values.get() + __chunk,
+ __reduction(__first + __index + 2,
+ __first + __index + __this_chunk_size,
+ __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
+ } else {
+ std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
+ }
+ });
- // __dispatch_apply is noexcept
- __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
- auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
- auto __index =
- __chunk == 0
- ? 0
- : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
- if (__this_chunk_size != 1) {
- std::__construct_at(
- __values.get() + __chunk,
- __reduction(__first + __index + 2,
- __first + __index + __this_chunk_size,
- __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
- } else {
- std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
- }
- });
+ return std::reduce(
+ std::make_move_iterator(__values.get()),
+ std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
+ std::move(__init),
+ __combiner);
+ }
- return std::reduce(
- std::make_move_iterator(__values.get()),
- std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
- std::move(__init),
- __combiner);
-}
+ template <class _RandomAccessIterator, class _Comp, class _LeafSort>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
+ const auto __size = __last - __first;
+ auto __partitions = __libdispatch::__partition_chunks(__size);
-template <class _RandomAccessIterator, class _Comp, class _LeafSort>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
- _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
- const auto __size = __last - __first;
- auto __partitions = __libdispatch::__partition_chunks(__size);
+ if (__partitions.__chunk_count_ == 0)
+ return __empty{};
- if (__partitions.__chunk_count_ == 0)
- return __empty{};
+ if (__partitions.__chunk_count_ == 1) {
+ __leaf_sort(__first, __last, __comp);
+ return __empty{};
+ }
- if (__partitions.__chunk_count_ == 1) {
- __leaf_sort(__first, __last, __comp);
- return __empty{};
- }
+ using _Value = __iter_value_type<_RandomAccessIterator>;
- using _Value = __iter_value_type<_RandomAccessIterator>;
+ auto __destroy = [__size](_Value* __ptr) {
+ std::destroy_n(__ptr, __size);
+ std::allocator<_Value>().deallocate(__ptr, __size);
+ };
- auto __destroy = [__size](_Value* __ptr) {
- std::destroy_n(__ptr, __size);
- std::allocator<_Value>().deallocate(__ptr, __size);
- };
+ // TODO: use __uninitialized_buffer
+ unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);
- // TODO: use __uninitialized_buffer
- unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);
+ // Initialize all elements to a moved-from state
+ // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
+ std::__construct_at(__values.get(), std::move(*__first));
+ for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
+ std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
+ }
+ *__first = std::move(__values.get()[__size - 1]);
+
+ __libdispatch::__dispatch_parallel_for(
+ __partitions,
+ __first,
+ [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
+ __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
+ });
+
+ bool __objects_are_in_buffer = false;
+ do {
+ const auto __old_chunk_size = __partitions.__chunk_size_;
+ if (__partitions.__chunk_count_ % 2 == 1) {
+ auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
+ std::inplace_merge(
+ __first_chunk_begin,
+ __first_chunk_begin + __partitions.__first_chunk_size_,
+ __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
+ __comp);
+ };
+ if (__objects_are_in_buffer)
+ __inplace_merge_chunks(__values.get());
+ else
+ __inplace_merge_chunks(__first);
+ __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
+ } else {
+ __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
+ }
- // Initialize all elements to a moved-from state
- // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
- std::__construct_at(__values.get(), std::move(*__first));
- for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
- std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
- }
- *__first = std::move(__values.get()[__size - 1]);
-
- __libdispatch::__dispatch_parallel_for(
- __partitions,
- __first,
- [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
- __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
- });
-
- bool __objects_are_in_buffer = false;
- do {
- const auto __old_chunk_size = __partitions.__chunk_size_;
- if (__partitions.__chunk_count_ % 2 == 1) {
- auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
- std::inplace_merge(
- __first_chunk_begin,
- __first_chunk_begin + __partitions.__first_chunk_size_,
- __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
- __comp);
+ __partitions.__chunk_size_ *= 2;
+ __partitions.__chunk_count_ /= 2;
+
+ auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
+ __libdispatch::__dispatch_parallel_for(
+ __partitions,
+ __from_first,
+ [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
+ std::merge(std::make_move_iterator(__chunk_first),
+ std::make_move_iterator(__chunk_last - __old_chunk_size),
+ std::make_move_iterator(__chunk_last - __old_chunk_size),
+ std::make_move_iterator(__chunk_last),
+ __to_first + (__chunk_first - __from_first),
+ __comp);
+ });
};
+
if (__objects_are_in_buffer)
- __inplace_merge_chunks(__values.get());
+ __merge_chunks(__values.get(), __first);
else
- __inplace_merge_chunks(__first);
- __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
- } else {
- __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
- }
-
- __partitions.__chunk_size_ *= 2;
- __partitions.__chunk_count_ /= 2;
-
- auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
- __libdispatch::__dispatch_parallel_for(
- __partitions,
- __from_first,
- [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
- std::merge(std::make_move_iterator(__chunk_first),
- std::make_move_iterator(__chunk_last - __old_chunk_size),
- std::make_move_iterator(__chunk_last - __old_chunk_size),
- std::make_move_iterator(__chunk_last),
- __to_first + (__chunk_first - __from_first),
- __comp);
- });
- };
+ __merge_chunks(__first, __values.get());
+ __objects_are_in_buffer = !__objects_are_in_buffer;
+ } while (__partitions.__chunk_count_ > 1);
- if (__objects_are_in_buffer)
- __merge_chunks(__values.get(), __first);
- else
- __merge_chunks(__first, __values.get());
- __objects_are_in_buffer = !__objects_are_in_buffer;
- } while (__partitions.__chunk_count_ > 1);
+ if (__objects_are_in_buffer) {
+ std::move(__values.get(), __values.get() + __size, __first);
+ }
- if (__objects_are_in_buffer) {
- std::move(__values.get(), __values.get() + __size, __first);
+ return __empty{};
}
- return __empty{};
-}
+ _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}
-_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
-
-} // namespace __libdispatch
-} // namespace __par_backend
+ static constexpr size_t __lane_size = 64;
+};
+} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
index b0db70f58b2ef4..c93f4051c9d094 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
@@ -13,6 +13,7 @@
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/move.h>
#include <optional>
@@ -45,7 +46,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge(
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- auto __res = __par_backend::__parallel_merge(
+ auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__merge(
__first1,
__last1,
__first2,
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
index afcc7ffb266130..7544619a8eefd8 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
@@ -11,6 +11,7 @@
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H
#include <__config>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__utility/empty.h>
#include <__utility/move.h>
#include <cstddef>
@@ -26,54 +27,55 @@ _LIBCPP_PUSH_MACROS
# include <__undef_macros>
_LIBCPP_BEGIN_NAMESPACE_STD
-
-namespace __par_backend {
-inline namespace __serial_cpu_backend {
-
-template <class _RandomAccessIterator, class _Fp>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
- __f(__first, __last);
- return __empty{};
-}
-
-template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
-_LIBCPP_HIDE_FROM_ABI optional<_Tp>
-__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
- return __reduce(std::move(__first), std::move(__last), std::move(__init));
-}
-
-template <class _RandomAccessIterator, class _Compare, class _LeafSort>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
- _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
- __leaf_sort(__first, __last, __comp);
- return __empty{};
-}
-
-_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
-
-template <class _RandomAccessIterator1,
- class _RandomAccessIterator2,
- class _RandomAccessIterator3,
- class _Compare,
- class _LeafMerge>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
- _RandomAccessIterator1 __first1,
- _RandomAccessIterator1 __last1,
- _RandomAccessIterator2 __first2,
- _RandomAccessIterator2 __last2,
- _RandomAccessIterator3 __outit,
- _Compare __comp,
- _LeafMerge __leaf_merge) {
- __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
- return __empty{};
-}
-
-// TODO: Complete this list
-
-} // namespace __serial_cpu_backend
-} // namespace __par_backend
-
+namespace __pstl {
+
+struct __serial_backend_tag {};
+
+template <>
+struct __cpu_traits<__serial_backend_tag> {
+ template <class _RandomAccessIterator, class _Fp>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
+ __f(__first, __last);
+ return __empty{};
+ }
+
+ template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
+ _LIBCPP_HIDE_FROM_ABI static optional<_Tp>
+ __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
+ return __reduce(std::move(__first), std::move(__last), std::move(__init));
+ }
+
+ template <class _RandomAccessIterator, class _Compare, class _LeafSort>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
+ __leaf_sort(__first, __last, __comp);
+ return __empty{};
+ }
+
+ _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}
+
+ template <class _RandomAccessIterator1,
+ class _RandomAccessIterator2,
+ class _RandomAccessIterator3,
+ class _Compare,
+ class _LeafMerge>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __merge(_RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1,
+ _RandomAccessIterator2 __first2,
+ _RandomAccessIterator2 __last2,
+ _RandomAccessIterator3 __outit,
+ _Compare __comp,
+ _LeafMerge __leaf_merge) {
+ __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
+ return __empty{};
+ }
+
+ static constexpr size_t __lane_size = 64;
+};
+
+} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
index 34c423586c4b74..8c60cf897ff860 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
@@ -12,6 +12,7 @@
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__algorithm/stable_sort.h>
#include <__config>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/empty.h>
#include <optional>
@@ -28,7 +29,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_stable_sort(__cpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy>) {
- return __par_backend::__parallel_stable_sort(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__stable_sort(
__first, __last, __comp, [](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) {
std::stable_sort(__g_first, __g_last, __g_comp);
});
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
index eb11a961b760c3..2acf912264a001 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
@@ -11,6 +11,7 @@
#include <__assert>
#include <__config>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__utility/empty.h>
#include <__utility/move.h>
#include <cstddef>
@@ -29,52 +30,55 @@ _LIBCPP_PUSH_MACROS
// by a proper implementation once the PSTL implementation is somewhat stable.
_LIBCPP_BEGIN_NAMESPACE_STD
-
-namespace __par_backend {
-inline namespace __thread_cpu_backend {
-
-template <class _RandomAccessIterator, class _Fp>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
- __f(__first, __last);
- return __empty{};
-}
-
-template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
-_LIBCPP_HIDE_FROM_ABI optional<_Tp>
-__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
- return __reduce(std::move(__first), std::move(__last), std::move(__init));
-}
-
-template <class _RandomAccessIterator, class _Compare, class _LeafSort>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
- _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
- __leaf_sort(__first, __last, __comp);
- return __empty{};
-}
-
-_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
-
-template <class _RandomAccessIterator1,
- class _RandomAccessIterator2,
- class _RandomAccessIterator3,
- class _Compare,
- class _LeafMerge>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
- _RandomAccessIterator1 __first1,
- _RandomAccessIterator1 __last1,
- _RandomAccessIterator2 __first2,
- _RandomAccessIterator2 __last2,
- _RandomAccessIterator3 __outit,
- _Compare __comp,
- _LeafMerge __leaf_merge) {
- __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
- return __empty{};
-}
-
-} // namespace __thread_cpu_backend
-} // namespace __par_backend
-
+namespace __pstl {
+
+struct __std_thread_backend_tag {};
+
+template <>
+struct __cpu_traits<__std_thread_backend_tag> {
+ template <class _RandomAccessIterator, class _Fp>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
+ __f(__first, __last);
+ return __empty{};
+ }
+
+ template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
+ _LIBCPP_HIDE_FROM_ABI static optional<_Tp>
+ __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
+ return __reduce(std::move(__first), std::move(__last), std::move(__init));
+ }
+
+ template <class _RandomAccessIterator, class _Compare, class _LeafSort>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
+ __leaf_sort(__first, __last, __comp);
+ return __empty{};
+ }
+
+ _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}
+
+ template <class _RandomAccessIterator1,
+ class _RandomAccessIterator2,
+ class _RandomAccessIterator3,
+ class _Compare,
+ class _LeafMerge>
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+ __merge(_RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1,
+ _RandomAccessIterator2 __first2,
+ _RandomAccessIterator2 __last2,
+ _RandomAccessIterator3 __outit,
+ _Compare __comp,
+ _LeafMerge __leaf_merge) {
+ __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
+ return __empty{};
+ }
+
+ static constexpr size_t __lane_size = 64;
+};
+
+} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index fdf1a2e78dad90..4b9b2968668327 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -14,6 +14,7 @@
#include <__config>
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_execution_policy.h>
#include <__type_traits/remove_cvref.h>
@@ -49,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- std::__par_backend::__parallel_for(
+ __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
__first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
auto __res = std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
@@ -97,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- auto __res = std::__par_backend::__parallel_for(
+ auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
__first1,
__last1,
[__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index 376abd39fa36e0..c074eea9861c1b 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -14,6 +14,7 @@
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
#include <__numeric/transform_reduce.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/desugars_to.h>
#include <__type_traits/is_arithmetic.h>
#include <__type_traits/is_execution_policy.h>
@@ -32,7 +33,8 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
-template <typename _DifferenceType,
+template <typename _Backend,
+ typename _DifferenceType,
typename _Tp,
typename _BinaryOperation,
typename _UnaryOperation,
@@ -48,7 +50,8 @@ __simd_transform_reduce(_DifferenceType __n, _Tp __init, _BinaryOperation, _Unar
return __init;
}
-template <typename _Size,
+template <typename _Backend,
+ typename _Size,
typename _Tp,
typename _BinaryOperation,
typename _UnaryOperation,
@@ -58,7 +61,8 @@ template <typename _Size,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _Tp
__simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept {
- const _Size __block_size = __lane_size / sizeof(_Tp);
+ constexpr size_t __lane_size = __pstl::__cpu_traits<_Backend>::__lane_size;
+ const _Size __block_size = __lane_size / sizeof(_Tp);
if (__n > 2 * __block_size && __block_size > 1) {
alignas(__lane_size) char __lane_buffer[__lane_size];
_Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer);
@@ -116,7 +120,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
- return __par_backend::__parallel_transform_reduce(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce(
__first1,
std::move(__last1),
[__first1, __first2, __transform](_ForwardIterator1 __iter) {
@@ -138,7 +142,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
} else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
- return std::__simd_transform_reduce(
+ return std::__simd_transform_reduce<__cpu_backend_tag>(
__last1 - __first1, std::move(__init), std::move(__reduce), [&](__iter_diff_t<_ForwardIterator1> __i) {
return __transform(__first1[__i], __first2[__i]);
});
@@ -163,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
_UnaryOperation __transform) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return __par_backend::__parallel_transform_reduce(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce(
std::move(__first),
std::move(__last),
[__transform](_ForwardIterator __iter) { return __transform(*__iter); },
@@ -182,7 +186,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
});
} else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return std::__simd_transform_reduce(
+ return std::__simd_transform_reduce<__cpu_backend_tag>(
__last - __first,
std::move(__init),
std::move(__reduce),
diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
new file mode 100644
index 00000000000000..b24f0973d8e5ba
--- /dev/null
+++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H
+#define _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H
+
+#include <__config>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
+
+// __cpu_traits
+//
+// This traits class encapsulates the basis operations for a CPU-based implementation of the PSTL.
+// All the operations in the PSTL can be implemented from these basis operations, so a pure CPU backend
+// only needs to customize these traits in order to get an implementation of the whole PSTL.
+//
+// Basis operations
+// ================
+//
+// template <class _RandomAccessIterator, class _Functor>
+// optional<__empty> __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
+// - __func is invoked with a subrange of [__first, __last), which it must process serially
+//
+// template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
+// optional<_Tp> __transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
+//
+// template <class _RandomAccessIterator1,
+// class _RandomAccessIterator2,
+// class _RandomAccessIterator3,
+// class _Compare,
+// class _LeafMerge>
+// optional<__empty> __merge(_RandomAccessIterator1 __first1,
+// _RandomAccessIterator1 __last1,
+// _RandomAccessIterator2 __first2,
+// _RandomAccessIterator2 __last2,
+// _RandomAccessIterator3 __outit,
+// _Compare __comp,
+// _LeafMerge __leaf_merge);
+//
+// template <class _RandomAccessIterator, class _Comp, class _LeafSort>
+// optional<__empty> __stable_sort(_RandomAccessIterator __first,
+// _RandomAccessIterator __last,
+// _Comp __comp,
+// _LeafSort __leaf_sort);
+//
+// void __cancel_execution();
+// Cancel the execution of other jobs - they aren't needed anymore. This is not a binding request,
+// some backends may not actually be able to cancel jobs.
+//
+// constexpr size_t __lane_size;
+// Size of SIMD lanes.
+//
+//
+// Exception handling
+// ==================
+//
+// CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
+// implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned
+// into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the
+// frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
+
+template <class _Backend>
+struct __cpu_traits;
+
+} // namespace __pstl
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index ed45a1b1833893..546e5dad1ccd58 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1604,6 +1604,8 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric
module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" }
module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" }
+module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" }
+
module std_private_queue_fwd [system] { header "__fwd/queue.h" }
module std_private_random_bernoulli_distribution [system] { header "__random/bernoulli_distribution.h" }
diff --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp
index 52d4afbcce6e00..d997a9c73463d3 100644
--- a/libcxx/src/pstl/libdispatch.cpp
+++ b/libcxx/src/pstl/libdispatch.cpp
@@ -12,8 +12,7 @@
#include <dispatch/dispatch.h>
_LIBCPP_BEGIN_NAMESPACE_STD
-
-namespace __par_backend::inline __libdispatch {
+namespace __pstl::__libdispatch {
void __dispatch_apply(size_t chunk_count, void* context, void (*func)(void* context, size_t chunk)) noexcept {
::dispatch_apply_f(chunk_count, DISPATCH_APPLY_AUTO, context, func);
@@ -29,7 +28,5 @@ __chunk_partitions __partition_chunks(ptrdiff_t element_count) noexcept {
return partitions;
}
-// NOLINTNEXTLINE(llvm-namespace-comment) // This is https://llvm.org/PR56804
-} // namespace __par_backend::inline __libdispatch
-
+} // namespace __pstl::__libdispatch
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/utils/generate_iwyu_mapping.py b/libcxx/utils/generate_iwyu_mapping.py
index 8ab7b86299edca..b8a8580ea30f34 100644
--- a/libcxx/utils/generate_iwyu_mapping.py
+++ b/libcxx/utils/generate_iwyu_mapping.py
@@ -10,6 +10,7 @@ def IWYU_mapping(header: str) -> typing.Optional[typing.List[str]]:
ignore = [
"__debug_utils/.+",
"__fwd/get[.]h",
+ "__pstl/.+",
"__support/.+",
]
if any(re.match(pattern, header) for pattern in ignore):
>From 62b2a384ebfcd6cb3e541489f4c43f9deb6aeda3 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 9 Apr 2024 10:36:35 -0400
Subject: [PATCH 2/6] Rename functions inside cpu_traits to reduce the diff
---
.../pstl_backends/cpu_backends/any_of.h | 2 +-
.../pstl_backends/cpu_backends/fill.h | 2 +-
.../pstl_backends/cpu_backends/find_if.h | 2 +-
.../pstl_backends/cpu_backends/for_each.h | 2 +-
.../pstl_backends/cpu_backends/libdispatch.h | 24 ++++++++---------
.../pstl_backends/cpu_backends/merge.h | 2 +-
.../pstl_backends/cpu_backends/serial.h | 24 ++++++++---------
.../pstl_backends/cpu_backends/stable_sort.h | 2 +-
.../pstl_backends/cpu_backends/thread.h | 24 ++++++++---------
.../pstl_backends/cpu_backends/transform.h | 4 +--
.../cpu_backends/transform_reduce.h | 4 +--
libcxx/include/__pstl/cpu_algos/cpu_traits.h | 26 +++++++++----------
12 files changed, 59 insertions(+), 59 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
index 3755d288047e0b..be5e54f3fa5c85 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
@@ -34,7 +34,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Backend, class _Index, class _Brick>
_LIBCPP_HIDE_FROM_ABI optional<bool> __parallel_or(_Index __first, _Index __last, _Brick __f) {
std::atomic<bool> __found(false);
- auto __ret = __pstl::__cpu_traits<_Backend>::__for_each(__first, __last, [__f, &__found](_Index __i, _Index __j) {
+ auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) {
__found.store(true, std::memory_order_relaxed);
__pstl::__cpu_traits<_Backend>::__cancel_execution();
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 0c20bdff62675a..49a32f6c5ce551 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
__first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
[[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __value);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
index 626293faef6921..11a5668bf25af1 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
@@ -42,7 +42,7 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
_DifferenceType __initial_dist = __b_first ? __n : -1;
std::atomic<_DifferenceType> __extremum(__initial_dist);
// TODO: find out what is better here: parallel_for or parallel_reduce
- auto __res = __pstl::__cpu_traits<_Backend>::__for_each(
+ auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for(
__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
// See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of
// why using a shared variable scales fairly well in this situation.
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index d637084e151d81..1667ec0f0c4f41 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
__first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
[[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __func);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
index 17faadf55dd4fa..8757f249680375 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
@@ -85,7 +85,7 @@ template <>
struct __cpu_traits<__libdispatch_backend_tag> {
template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
+ __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
return __libdispatch::__dispatch_parallel_for(
__libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
}
@@ -105,14 +105,14 @@ struct __cpu_traits<__libdispatch_backend_tag> {
typename _RandomAccessIterator3,
typename _Compare,
typename _LeafMerge>
- _LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __merge(_RandomAccessIterator1 __first1,
- _RandomAccessIterator1 __last1,
- _RandomAccessIterator2 __first2,
- _RandomAccessIterator2 __last2,
- _RandomAccessIterator3 __result,
- _Compare __comp,
- _LeafMerge __leaf_merge) noexcept {
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge(
+ _RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1,
+ _RandomAccessIterator2 __first2,
+ _RandomAccessIterator2 __last2,
+ _RandomAccessIterator3 __result,
+ _Compare __comp,
+ _LeafMerge __leaf_merge) noexcept {
__libdispatch::__chunk_partitions __partitions =
__libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));
@@ -201,7 +201,7 @@ struct __cpu_traits<__libdispatch_backend_tag> {
}
template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
- _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
+ _LIBCPP_HIDE_FROM_ABI static optional<_Value> __parallel_transform_reduce(
_RandomAccessIterator __first,
_RandomAccessIterator __last,
_Transform __transform,
@@ -248,8 +248,8 @@ struct __cpu_traits<__libdispatch_backend_tag> {
}
template <class _RandomAccessIterator, class _Comp, class _LeafSort>
- _LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort(
+ _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
const auto __size = __last - __first;
auto __partitions = __libdispatch::__partition_chunks(__size);
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
index c93f4051c9d094..d034447904872e 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
@@ -46,7 +46,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge(
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__merge(
+ auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_merge(
__first1,
__last1,
__first2,
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
index 7544619a8eefd8..c3d2905daed170 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
@@ -35,20 +35,20 @@ template <>
struct __cpu_traits<__serial_backend_tag> {
template <class _RandomAccessIterator, class _Fp>
_LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
+ __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
__f(__first, __last);
return __empty{};
}
template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
_LIBCPP_HIDE_FROM_ABI static optional<_Tp>
- __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
+ __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
return __reduce(std::move(__first), std::move(__last), std::move(__init));
}
template <class _RandomAccessIterator, class _Compare, class _LeafSort>
- _LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort(
+ _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
__leaf_sort(__first, __last, __comp);
return __empty{};
}
@@ -60,14 +60,14 @@ struct __cpu_traits<__serial_backend_tag> {
class _RandomAccessIterator3,
class _Compare,
class _LeafMerge>
- _LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __merge(_RandomAccessIterator1 __first1,
- _RandomAccessIterator1 __last1,
- _RandomAccessIterator2 __first2,
- _RandomAccessIterator2 __last2,
- _RandomAccessIterator3 __outit,
- _Compare __comp,
- _LeafMerge __leaf_merge) {
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge(
+ _RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1,
+ _RandomAccessIterator2 __first2,
+ _RandomAccessIterator2 __last2,
+ _RandomAccessIterator3 __outit,
+ _Compare __comp,
+ _LeafMerge __leaf_merge) {
__leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
return __empty{};
}
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
index 8c60cf897ff860..ebfa0fc69147d5 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
@@ -29,7 +29,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_stable_sort(__cpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy>) {
- return __pstl::__cpu_traits<__cpu_backend_tag>::__stable_sort(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_stable_sort(
__first, __last, __comp, [](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) {
std::stable_sort(__g_first, __g_last, __g_comp);
});
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
index 2acf912264a001..8d1cb221c3d82a 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
@@ -38,20 +38,20 @@ template <>
struct __cpu_traits<__std_thread_backend_tag> {
template <class _RandomAccessIterator, class _Fp>
_LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
+ __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
__f(__first, __last);
return __empty{};
}
template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
_LIBCPP_HIDE_FROM_ABI static optional<_Tp>
- __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
+ __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
return __reduce(std::move(__first), std::move(__last), std::move(__init));
}
template <class _RandomAccessIterator, class _Compare, class _LeafSort>
- _LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort(
+ _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
__leaf_sort(__first, __last, __comp);
return __empty{};
}
@@ -63,14 +63,14 @@ struct __cpu_traits<__std_thread_backend_tag> {
class _RandomAccessIterator3,
class _Compare,
class _LeafMerge>
- _LIBCPP_HIDE_FROM_ABI static optional<__empty>
- __merge(_RandomAccessIterator1 __first1,
- _RandomAccessIterator1 __last1,
- _RandomAccessIterator2 __first2,
- _RandomAccessIterator2 __last2,
- _RandomAccessIterator3 __outit,
- _Compare __comp,
- _LeafMerge __leaf_merge) {
+ _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge(
+ _RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1,
+ _RandomAccessIterator2 __first2,
+ _RandomAccessIterator2 __last2,
+ _RandomAccessIterator3 __outit,
+ _Compare __comp,
+ _LeafMerge __leaf_merge) {
__leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
return __empty{};
}
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index 4b9b2968668327..d4c383997a67a9 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -50,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
+ __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
__first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
auto __res = std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
@@ -98,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
__has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
- auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__for_each(
+ auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
__first1,
__last1,
[__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index c074eea9861c1b..956c7d6a88ce29 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -120,7 +120,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
__has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
- return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce(
__first1,
std::move(__last1),
[__first1, __first2, __transform](_ForwardIterator1 __iter) {
@@ -167,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
_UnaryOperation __transform) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
- return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce(
+ return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce(
std::move(__first),
std::move(__last),
[__transform](_ForwardIterator __iter) { return __transform(*__iter); },
diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
index b24f0973d8e5ba..5416ac7d1be31d 100644
--- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h
+++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
@@ -32,30 +32,30 @@ namespace __pstl {
// ================
//
// template <class _RandomAccessIterator, class _Functor>
-// optional<__empty> __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
+// optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
// - __func must take a subrange of [__first, __last) that should be executed in serial
//
// template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
-// optional<_Tp> __transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
+// optional<_Tp> __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
//
// template <class _RandomAccessIterator1,
// class _RandomAccessIterator2,
// class _RandomAccessIterator3,
// class _Compare,
// class _LeafMerge>
-// optional<_RandomAccessIterator3> __merge(_RandomAccessIterator1 __first1,
-// _RandomAccessIterator1 __last1,
-// _RandomAccessIterator2 __first2,
-// _RandomAccessIterator2 __last2,
-// _RandomAccessIterator3 __outit,
-// _Compare __comp,
-// _LeafMerge __leaf_merge);
+// optional<__empty> __parallel_merge(_RandomAccessIterator1 __first1,
+// _RandomAccessIterator1 __last1,
+// _RandomAccessIterator2 __first2,
+// _RandomAccessIterator2 __last2,
+// _RandomAccessIterator3 __outit,
+// _Compare __comp,
+// _LeafMerge __leaf_merge);
//
// template <class _RandomAccessIterator, class _Comp, class _LeafSort>
-// optional<__empty> __stable_sort(_RandomAccessIterator __first,
-// _RandomAccessIterator __last,
-// _Comp __comp,
-// _LeafSort __leaf_sort);
+// optional<__empty> __parallel_stable_sort(_RandomAccessIterator __first,
+// _RandomAccessIterator __last,
+// _Comp __comp,
+// _LeafSort __leaf_sort);
//
// void __cancel_execution();
// Cancel the execution of other jobs - they aren't needed anymore. This is not a binding request,
>From 619b721d7ec3b3daf986d4bb2e2c59e7803d145b Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 9 Apr 2024 10:59:33 -0400
Subject: [PATCH 3/6] Formatting
---
libcxx/include/__algorithm/pstl_backends/cpu_backend.h | 2 +-
libcxx/include/__pstl/cpu_algos/cpu_traits.h | 10 ++++++----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index c93139243af459..53eae58f960952 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -9,7 +9,6 @@
#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
-#include <__config>
#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__algorithm/pstl_backends/cpu_backends/fill.h>
@@ -19,5 +18,6 @@
#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
#include <__algorithm/pstl_backends/cpu_backends/transform.h>
#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#include <__config>
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
index 5416ac7d1be31d..e462129f22eda1 100644
--- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h
+++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
@@ -36,7 +36,8 @@ namespace __pstl {
// - __func must take a subrange of [__first, __last) that should be executed in serial
//
// template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
-// optional<_Tp> __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
+// optional<_Tp> __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp,
+// _Reduction);
//
// template <class _RandomAccessIterator1,
// class _RandomAccessIterator2,
@@ -69,9 +70,10 @@ namespace __pstl {
// ==================
//
// CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
-// implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned
-// into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the
-// frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
+// implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are
+// turned into a program termination at the front-end level. When a backend returns a disengaged `optional` to the
+// frontend, the frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to
+// the user.
template <class _Backend>
struct __cpu_traits;
>From affcf3214107163f85d1b24aaea803e30d2c5150 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Fri, 12 Apr 2024 13:45:32 -0400
Subject: [PATCH 4/6] Fix libcxxabi tests
---
.../include/__algorithm/pstl_backends/cpu_backends/backend.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index 0641a51e6823e3..cb9425862a2b03 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -26,7 +26,7 @@
# pragma GCC system_header
#endif
-#if _LIBCPP_STD_VER >= 17
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
_LIBCPP_BEGIN_NAMESPACE_STD
@@ -40,6 +40,6 @@ using __cpu_backend_tag = __pstl::__libdispatch_backend_tag;
_LIBCPP_END_NAMESPACE_STD
-#endif // _LIBCPP_STD_VER >= 17
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H
>From f2037debbc8296d545eb838d60f41c7dbc9a8ed3 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Fri, 12 Apr 2024 16:55:57 -0400
Subject: [PATCH 5/6] Add comment
---
libcxx/include/__pstl/cpu_algos/cpu_traits.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
index e462129f22eda1..2f0db46e9be83a 100644
--- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h
+++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
@@ -64,6 +64,7 @@ namespace __pstl {
//
// constexpr size_t __lane_size;
// Size of SIMD lanes.
+// TODO: Merge this with __native_vector_size from __algorithm/simd_utils.h
//
//
// Exception handling
>From 649727613f9b6d58cd6b0290040d6a6da35a85d6 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Fri, 12 Apr 2024 17:00:24 -0400
Subject: [PATCH 6/6] Fix CI with libdispatch
---
.../algorithms/pstl.libdispatch.chunk_partitions.pass.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
index 554924a0179d56..8c7016a80b811a 100644
--- a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
@@ -18,21 +18,21 @@
int main(int, char**) {
{
- auto chunks = std::__par_backend::__libdispatch::__partition_chunks(0);
+ auto chunks = std::__pstl::__libdispatch::__partition_chunks(0);
assert(chunks.__chunk_count_ == 1);
assert(chunks.__first_chunk_size_ == 0);
assert(chunks.__chunk_size_ == 0);
}
{
- auto chunks = std::__par_backend::__libdispatch::__partition_chunks(1);
+ auto chunks = std::__pstl::__libdispatch::__partition_chunks(1);
assert(chunks.__chunk_count_ == 1);
assert(chunks.__first_chunk_size_ == 1);
assert(chunks.__chunk_size_ == 1);
}
for (std::ptrdiff_t i = 2; i != 2ll << 20; ++i) {
- auto chunks = std::__par_backend::__libdispatch::__partition_chunks(i);
+ auto chunks = std::__pstl::__libdispatch::__partition_chunks(i);
assert(chunks.__chunk_count_ >= 1);
assert(chunks.__chunk_count_ <= i);
assert((chunks.__chunk_count_ - 1) * chunks.__chunk_size_ + chunks.__first_chunk_size_ == i);
More information about the libcxx-commits
mailing list