[libcxx-commits] [libcxx] a3ce29f - [libc++][PSTL] Introduce cpu traits (#88134)

Mon Apr 15 07:30:05 PDT 2024

Author: Louis Dionne
Date: 2024-04-15T10:30:00-04:00
New Revision: a3ce29f7bb5510131971ac5ccc63132dd48c8dd2

URL: https://github.com/llvm/llvm-project/commit/a3ce29f7bb5510131971ac5ccc63132dd48c8dd2
DIFF: https://github.com/llvm/llvm-project/commit/a3ce29f7bb5510131971ac5ccc63132dd48c8dd2.diff

LOG: [libc++][PSTL] Introduce cpu traits (#88134)

Currently, CPU backends in the PSTL are created by defining functions
in the __par_backend namespace. Then, the PSTL includes the CPU backend
that gets configured via CMake and gets those definitions.

This prevents CPU backends from easily co-existing and is a bit
confusing.
To solve this problem, this patch introduces the notion of __cpu_traits,
which is a cheap encapsulation of the basis operations required to
implement a CPU-based PSTL. Different backends can now define their own
tag and coexist, and the CPU-based PSTL will simply use __cpu_traits to
dispatch to the right implementation of e.g. __for_each.

Note that this patch doesn't change the actual implementation of the
backends in any way, it only modifies how that implementation is
accessed
to implement PSTL algorithms.

This patch is a step towards #88131.

Added: 
    libcxx/include/__pstl/cpu_algos/cpu_traits.h

Modified: 
    libcxx/include/CMakeLists.txt
    libcxx/include/__algorithm/pstl_backends/cpu_backend.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
    libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
    libcxx/include/module.modulemap
    libcxx/src/pstl/libdispatch.cpp
    libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
    libcxx/utils/generate_iwyu_mapping.py

Removed: 
    


################################################################################
diff  --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 1b6c6de082da94..148a51a216ed4a 100644

--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -594,6 +594,7 @@ set(files
   __numeric/transform_exclusive_scan.h
   __numeric/transform_inclusive_scan.h
   __numeric/transform_reduce.h
+  __pstl/cpu_algos/cpu_traits.h
   __random/bernoulli_distribution.h
   __random/binomial_distribution.h
   __random/cauchy_distribution.h

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
index 6980ded189ea2a..53eae58f960952 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h
@@ -9,52 +9,6 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
 
-#include <__config>
-
-/*
-
-  // _Functor takes a subrange for [__first, __last) that should be executed in serial
-  template <class _RandomAccessIterator, class _Functor>
-  optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
-
-  template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
-  optional<_Tp>
-  __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
-
-  // Cancel the execution of other jobs - they aren't needed anymore
-  void __cancel_execution();
-
-  template <class _RandomAccessIterator1,
-            class _RandomAccessIterator2,
-            class _RandomAccessIterator3,
-            class _Compare,
-            class _LeafMerge>
-  optional<void> __parallel_merge(
-      _RandomAccessIterator1 __first1,
-      _RandomAccessIterator1 __last1,
-      _RandomAccessIterator2 __first2,
-      _RandomAccessIterator2 __last2,
-      _RandomAccessIterator3 __outit,
-      _Compare __comp,
-      _LeafMerge __leaf_merge);
-
-  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
-  void __parallel_stable_sort(_RandomAccessIterator __first,
-                              _RandomAccessIterator __last,
-                              _Comp __comp,
-                              _LeafSort __leaf_sort);
-
-  TODO: Document the parallel backend
-
-Exception handling
-==================
-
-CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
-implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned
-into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the
-frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
-*/
-
 #include <__algorithm/pstl_backends/cpu_backends/any_of.h>
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/pstl_backends/cpu_backends/fill.h>
@@ -64,5 +18,6 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
 #include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
 #include <__algorithm/pstl_backends/cpu_backends/transform.h>
 #include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
+#include <__config>
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
index 13dff80086e72b..be5e54f3fa5c85 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h
@@ -17,6 +17,7 @@
 #include <__config>
 #include <__functional/operations.h>
 #include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
@@ -30,13 +31,13 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Index, class _Brick>
+template <class _Backend, class _Index, class _Brick>
 _LIBCPP_HIDE_FROM_ABI optional<bool> __parallel_or(_Index __first, _Index __last, _Brick __f) {
   std::atomic<bool> __found(false);
-  auto __ret = __par_backend::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
+  auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
     if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) {
       __found.store(true, std::memory_order_relaxed);
-      __par_backend::__cancel_execution();
+      __pstl::__cpu_traits<_Backend>::__cancel_execution();
     }
   });
   if (!__ret)
@@ -74,7 +75,7 @@ _LIBCPP_HIDE_FROM_ABI optional<bool>
 __pstl_any_of(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__parallel_or(
+    return std::__parallel_or<__cpu_backend_tag>(
         __first, __last, [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
           auto __res = std::__pstl_any_of<__remove_parallel_policy_t<_ExecutionPolicy>>(
               __cpu_backend_tag{}, __brick_first, __brick_last, __pred);

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
index ea2210a4a7adbd..cb9425862a2b03 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h
@@ -26,16 +26,20 @@
 #  pragma GCC system_header
 #endif
 
-#if _LIBCPP_STD_VER >= 17
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-struct __cpu_backend_tag {};
-
-inline constexpr size_t __lane_size = 64;
+#  if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
+using __cpu_backend_tag = __pstl::__serial_backend_tag;
+#  elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
+using __cpu_backend_tag = __pstl::__std_thread_backend_tag;
+#  elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
+using __cpu_backend_tag = __pstl::__libdispatch_backend_tag;
+#  endif
 
 _LIBCPP_END_NAMESPACE_STD
 
-#endif // _LIBCPP_STD_VER >= 17
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17
 
 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
index 64babe9fd2bdae..49a32f6c5ce551 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return __par_backend::__parallel_for(
+    return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
         __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
           [[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
               __cpu_backend_tag{}, __brick_first, __brick_last, __value);

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
index 170470e4fb7edd..11a5668bf25af1 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
@@ -16,6 +16,7 @@
 #include <__functional/operations.h>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
@@ -33,7 +34,7 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Index, class _Brick, class _Compare>
+template <class _Backend, class _Index, class _Brick, class _Compare>
 _LIBCPP_HIDE_FROM_ABI optional<_Index>
 __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool __b_first) {
   typedef typename std::iterator_traits<_Index>::
diff erence_type _DifferenceType;
@@ -41,8 +42,8 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
   _DifferenceType __initial_dist = __b_first ? __n : -1;
   std::atomic<_DifferenceType> __extremum(__initial_dist);
   // TODO: find out what is better here: parallel_for or parallel_reduce
-  auto __res =
-      __par_backend::__parallel_for(__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
+  auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for(
+      __first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
         // See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of
         // why using a shared variable scales fairly well in this situation.
         if (__comp(__i - __first, __extremum)) {
@@ -61,12 +62,12 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
   return __extremum.load() != __initial_dist ? __first + __extremum.load() : __last;
 }
 
-template <class _Index, class _DifferenceType, class _Compare>
+template <class _Backend, class _Index, class _DifferenceType, class _Compare>
 _LIBCPP_HIDE_FROM_ABI _Index
 __simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept {
   // Experiments show good block sizes like this
-  const _DifferenceType __block_size                        = 8;
-  alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
+  const _DifferenceType __block_size                                                        = 8;
+  alignas(__pstl::__cpu_traits<_Backend>::__lane_size) _DifferenceType __lane[__block_size] = {0};
   while (__end - __begin >= __block_size) {
     _DifferenceType __found = 0;
     _PSTL_PRAGMA_SIMD_REDUCTION(| : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size; ++__i) {
@@ -102,7 +103,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
 __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__parallel_find(
+    return std::__parallel_find<__cpu_backend_tag>(
         __first,
         __last,
         [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -116,9 +117,10 @@ __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __l
   } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                        __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
     using __
diff _t = __iter_
diff _t<_ForwardIterator>;
-    return std::__simd_first(__first, __
diff _t(0), __last - __first, [&__pred](_ForwardIterator __iter, __
diff _t __i) {
-      return __pred(__iter[__i]);
-    });
+    return std::__simd_first<__cpu_backend_tag>(
+        __first, __
diff _t(0), __last - __first, [&__pred](_ForwardIterator __iter, __
diff _t __i) {
+          return __pred(__iter[__i]);
+        });
   } else {
     return std::find_if(__first, __last, __pred);
   }

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
index 81fd4526b8dbf1..1667ec0f0c4f41 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__par_backend::__parallel_for(
+    return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
         __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
           [[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
               __cpu_backend_tag{}, __brick_first, __brick_last, __func);

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
index e885e7f225172c..8757f249680375 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
@@ -23,6 +23,7 @@
 #include <__memory/construct_at.h>
 #include <__memory/unique_ptr.h>
 #include <__numeric/reduce.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__utility/empty.h>
 #include <__utility/exception_guard.h>
 #include <__utility/move.h>
@@ -37,10 +38,11 @@ _LIBCPP_PUSH_MACROS
 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
 
 _LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
 
-namespace __par_backend {
-inline namespace __libdispatch {
+struct __libdispatch_backend_tag {};
 
+namespace __libdispatch {
 // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
 // we.
 // TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
@@ -77,267 +79,270 @@ __dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator _
 
   return __empty{};
 }
+} // namespace __libdispatch
 
-template <class _RandomAccessIterator, class _Functor>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
-  return __libdispatch::__dispatch_parallel_for(
-      __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
-}
-
-template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
-struct __merge_range {
-  __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
-      : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}
+template <>
+struct __cpu_traits<__libdispatch_backend_tag> {
+  template <class _RandomAccessIterator, class _Functor>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+  __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
+    return __libdispatch::__dispatch_parallel_for(
+        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
+  }
 
-  _RandomAccessIterator1 __mid1_;
-  _RandomAccessIterator2 __mid2_;
-  _RandomAccessIteratorOut __result_;
-};
+  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
+  struct __merge_range {
+    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
+        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}
 
-template <typename _RandomAccessIterator1,
-          typename _RandomAccessIterator2,
-          typename _RandomAccessIterator3,
-          typename _Compare,
-          typename _LeafMerge>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
-    _RandomAccessIterator1 __first1,
-    _RandomAccessIterator1 __last1,
-    _RandomAccessIterator2 __first2,
-    _RandomAccessIterator2 __last2,
-    _RandomAccessIterator3 __result,
-    _Compare __comp,
-    _LeafMerge __leaf_merge) noexcept {
-  __chunk_partitions __partitions =
-      __libdispatch::__partition_chunks(std::max<ptr
diff _t>(__last1 - __first1, __last2 - __first2));
-
-  if (__partitions.__chunk_count_ == 0)
-    return __empty{};
+    _RandomAccessIterator1 __mid1_;
+    _RandomAccessIterator2 __mid2_;
+    _RandomAccessIteratorOut __result_;
+  };
 
-  if (__partitions.__chunk_count_ == 1) {
-    __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
-    return __empty{};
-  }
+  template <typename _RandomAccessIterator1,
+            typename _RandomAccessIterator2,
+            typename _RandomAccessIterator3,
+            typename _Compare,
+            typename _LeafMerge>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge(
+      _RandomAccessIterator1 __first1,
+      _RandomAccessIterator1 __last1,
+      _RandomAccessIterator2 __first2,
+      _RandomAccessIterator2 __last2,
+      _RandomAccessIterator3 __result,
+      _Compare __comp,
+      _LeafMerge __leaf_merge) noexcept {
+    __libdispatch::__chunk_partitions __partitions =
+        __libdispatch::__partition_chunks(std::max<ptr
diff _t>(__last1 - __first1, __last2 - __first2));
+
+    if (__partitions.__chunk_count_ == 0)
+      return __empty{};
+
+    if (__partitions.__chunk_count_ == 1) {
+      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
+      return __empty{};
+    }
 
-  using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
-  auto const __n_ranges = __partitions.__chunk_count_ + 1;
+    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
+    auto const __n_ranges = __partitions.__chunk_count_ + 1;
 
-  // TODO: use __uninitialized_buffer
-  auto __destroy = [=](__merge_range_t* __ptr) {
-    std::destroy_n(__ptr, __n_ranges);
-    std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
-  };
+    // TODO: use __uninitialized_buffer
+    auto __destroy = [=](__merge_range_t* __ptr) {
+      std::destroy_n(__ptr, __n_ranges);
+      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
+    };
 
-  unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
-      [&]() -> __merge_range_t* {
+    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
+        [&]() -> __merge_range_t* {
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-        try {
+          try {
 #  endif
-          return std::allocator<__merge_range_t>().allocate(__n_ranges);
+            return std::allocator<__merge_range_t>().allocate(__n_ranges);
 #  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
-        } catch (const std::bad_alloc&) {
-          return nullptr;
-        }
+          } catch (const std::bad_alloc&) {
+            return nullptr;
+          }
 #  endif
-      }(),
-      __destroy);
-
-  if (!__ranges)
-    return nullopt;
+        }(),
+        __destroy);
+
+    if (!__ranges)
+      return nullopt;
+
+    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
+    __merge_range_t* __r = __ranges.get();
+    std::__construct_at(__r++, __first1, __first2, __result);
+
+    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;
+
+    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
+      auto [__mid1, __mid2] = [&] {
+        if (__iterate_first_range) {
+          auto __m1 = __first1 + __chunk_size;
+          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
+          return std::make_pair(__m1, __m2);
+        } else {
+          auto __m2 = __first2 + __chunk_size;
+          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
+          return std::make_pair(__m1, __m2);
+        }
+      }();
 
-  // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
-  __merge_range_t* __r = __ranges.get();
-  std::__construct_at(__r++, __first1, __first2, __result);
+      __result += (__mid1 - __first1) + (__mid2 - __first2);
+      __first1 = __mid1;
+      __first2 = __mid2;
+      return {std::move(__mid1), std::move(__mid2), __result};
+    };
 
-  bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;
+    // handle first chunk
+    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));
 
-  auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
-    auto [__mid1, __mid2] = [&] {
-      if (__iterate_first_range) {
-        auto __m1 = __first1 + __chunk_size;
-        auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
-        return std::make_pair(__m1, __m2);
-      } else {
-        auto __m2 = __first2 + __chunk_size;
-        auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
-        return std::make_pair(__m1, __m2);
-      }
-    }();
+    // handle 2 -> N - 1 chunks
+    for (ptr
diff _t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
+      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));
 
-    __result += (__mid1 - __first1) + (__mid2 - __first2);
-    __first1 = __mid1;
-    __first2 = __mid2;
-    return {std::move(__mid1), std::move(__mid2), __result};
-  };
+    // handle last chunk
+    std::__construct_at(__r, __last1, __last2, __result);
 
-  // handle first chunk
-  std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));
-
-  // handle 2 -> N - 1 chunks
-  for (ptr
diff _t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
-    std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));
-
-  // handle last chunk
-  std::__construct_at(__r, __last1, __last2, __result);
-
-  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
-    auto __first_iters = __ranges[__index];
-    auto __last_iters  = __ranges[__index + 1];
-    __leaf_merge(
-        __first_iters.__mid1_,
-        __last_iters.__mid1_,
-        __first_iters.__mid2_,
-        __last_iters.__mid2_,
-        __first_iters.__result_,
-        __comp);
-  });
+    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
+      auto __first_iters = __ranges[__index];
+      auto __last_iters  = __ranges[__index + 1];
+      __leaf_merge(
+          __first_iters.__mid1_,
+          __last_iters.__mid1_,
+          __first_iters.__mid2_,
+          __last_iters.__mid2_,
+          __first_iters.__result_,
+          __comp);
+    });
 
-  return __empty{};
-}
+    return __empty{};
+  }
 
-template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
-_LIBCPP_HIDE_FROM_ABI optional<_Value> __parallel_transform_reduce(
-    _RandomAccessIterator __first,
-    _RandomAccessIterator __last,
-    _Transform __transform,
-    _Value __init,
-    _Combiner __combiner,
-    _Reduction __reduction) {
-  if (__first == __last)
-    return __init;
-
-  auto __partitions = __libdispatch::__partition_chunks(__last - __first);
-
-  auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
-    std::destroy_n(__ptr, __count);
-    std::allocator<_Value>().deallocate(__ptr, __count);
-  };
+  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
+  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __parallel_transform_reduce(
+      _RandomAccessIterator __first,
+      _RandomAccessIterator __last,
+      _Transform __transform,
+      _Value __init,
+      _Combiner __combiner,
+      _Reduction __reduction) {
+    if (__first == __last)
+      return __init;
+
+    auto __partitions = __libdispatch::__partition_chunks(__last - __first);
+
+    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
+      std::destroy_n(__ptr, __count);
+      std::allocator<_Value>().deallocate(__ptr, __count);
+    };
 
-  // TODO: use __uninitialized_buffer
-  // TODO: allocate one element per worker instead of one element per chunk
-  unique_ptr<_Value[], decltype(__destroy)> __values(
-      std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
+    // TODO: use __uninitialized_buffer
+    // TODO: allocate one element per worker instead of one element per chunk
+    unique_ptr<_Value[], decltype(__destroy)> __values(
+        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
+
+    // __dispatch_apply is noexcept
+    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
+      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
+      auto __index           = __chunk == 0 ? 0
+                                            : (__chunk * __partitions.__chunk_size_) +
+                                        (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
+      if (__this_chunk_size != 1) {
+        std::__construct_at(
+            __values.get() + __chunk,
+            __reduction(__first + __index + 2,
+                        __first + __index + __this_chunk_size,
+                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
+      } else {
+        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
+      }
+    });
 
-  // __dispatch_apply is noexcept
-  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
-    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
-    auto __index =
-        __chunk == 0
-            ? 0
-            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
-    if (__this_chunk_size != 1) {
-      std::__construct_at(
-          __values.get() + __chunk,
-          __reduction(__first + __index + 2,
-                      __first + __index + __this_chunk_size,
-                      __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
-    } else {
-      std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
-    }
-  });
+    return std::reduce(
+        std::make_move_iterator(__values.get()),
+        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
+        std::move(__init),
+        __combiner);
+  }
 
-  return std::reduce(
-      std::make_move_iterator(__values.get()),
-      std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
-      std::move(__init),
-      __combiner);
-}
+  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort(
+      _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
+    const auto __size = __last - __first;
+    auto __partitions = __libdispatch::__partition_chunks(__size);
 
-template <class _RandomAccessIterator, class _Comp, class _LeafSort>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
-    _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
-  const auto __size = __last - __first;
-  auto __partitions = __libdispatch::__partition_chunks(__size);
+    if (__partitions.__chunk_count_ == 0)
+      return __empty{};
 
-  if (__partitions.__chunk_count_ == 0)
-    return __empty{};
+    if (__partitions.__chunk_count_ == 1) {
+      __leaf_sort(__first, __last, __comp);
+      return __empty{};
+    }
 
-  if (__partitions.__chunk_count_ == 1) {
-    __leaf_sort(__first, __last, __comp);
-    return __empty{};
-  }
+    using _Value = __iter_value_type<_RandomAccessIterator>;
 
-  using _Value = __iter_value_type<_RandomAccessIterator>;
+    auto __destroy = [__size](_Value* __ptr) {
+      std::destroy_n(__ptr, __size);
+      std::allocator<_Value>().deallocate(__ptr, __size);
+    };
 
-  auto __destroy = [__size](_Value* __ptr) {
-    std::destroy_n(__ptr, __size);
-    std::allocator<_Value>().deallocate(__ptr, __size);
-  };
+    // TODO: use __uninitialized_buffer
+    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);
 
-  // TODO: use __uninitialized_buffer
-  unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);
+    // Initialize all elements to a moved-from state
+    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
+    std::__construct_at(__values.get(), std::move(*__first));
+    for (__iter_
diff _t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
+      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
+    }
+    *__first = std::move(__values.get()[__size - 1]);
+
+    __libdispatch::__dispatch_parallel_for(
+        __partitions,
+        __first,
+        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
+          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
+        });
+
+    bool __objects_are_in_buffer = false;
+    do {
+      const auto __old_chunk_size = __partitions.__chunk_size_;
+      if (__partitions.__chunk_count_ % 2 == 1) {
+        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
+          std::inplace_merge(
+              __first_chunk_begin,
+              __first_chunk_begin + __partitions.__first_chunk_size_,
+              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
+              __comp);
+        };
+        if (__objects_are_in_buffer)
+          __inplace_merge_chunks(__values.get());
+        else
+          __inplace_merge_chunks(__first);
+        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
+      } else {
+        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
+      }
 
-  // Initialize all elements to a moved-from state
-  // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
-  std::__construct_at(__values.get(), std::move(*__first));
-  for (__iter_
diff _t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
-    std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
-  }
-  *__first = std::move(__values.get()[__size - 1]);
-
-  __libdispatch::__dispatch_parallel_for(
-      __partitions,
-      __first,
-      [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
-        __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
-      });
-
-  bool __objects_are_in_buffer = false;
-  do {
-    const auto __old_chunk_size = __partitions.__chunk_size_;
-    if (__partitions.__chunk_count_ % 2 == 1) {
-      auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
-        std::inplace_merge(
-            __first_chunk_begin,
-            __first_chunk_begin + __partitions.__first_chunk_size_,
-            __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
-            __comp);
+      __partitions.__chunk_size_ *= 2;
+      __partitions.__chunk_count_ /= 2;
+
+      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
+        __libdispatch::__dispatch_parallel_for(
+            __partitions,
+            __from_first,
+            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
+              std::merge(std::make_move_iterator(__chunk_first),
+                         std::make_move_iterator(__chunk_last - __old_chunk_size),
+                         std::make_move_iterator(__chunk_last - __old_chunk_size),
+                         std::make_move_iterator(__chunk_last),
+                         __to_first + (__chunk_first - __from_first),
+                         __comp);
+            });
       };
+
       if (__objects_are_in_buffer)
-        __inplace_merge_chunks(__values.get());
+        __merge_chunks(__values.get(), __first);
       else
-        __inplace_merge_chunks(__first);
-      __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
-    } else {
-      __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
-    }
-
-    __partitions.__chunk_size_ *= 2;
-    __partitions.__chunk_count_ /= 2;
-
-    auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
-      __libdispatch::__dispatch_parallel_for(
-          __partitions,
-          __from_first,
-          [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
-            std::merge(std::make_move_iterator(__chunk_first),
-                       std::make_move_iterator(__chunk_last - __old_chunk_size),
-                       std::make_move_iterator(__chunk_last - __old_chunk_size),
-                       std::make_move_iterator(__chunk_last),
-                       __to_first + (__chunk_first - __from_first),
-                       __comp);
-          });
-    };
+        __merge_chunks(__first, __values.get());
+      __objects_are_in_buffer = !__objects_are_in_buffer;
+    } while (__partitions.__chunk_count_ > 1);
 
-    if (__objects_are_in_buffer)
-      __merge_chunks(__values.get(), __first);
-    else
-      __merge_chunks(__first, __values.get());
-    __objects_are_in_buffer = !__objects_are_in_buffer;
-  } while (__partitions.__chunk_count_ > 1);
+    if (__objects_are_in_buffer) {
+      std::move(__values.get(), __values.get() + __size, __first);
+    }
 
-  if (__objects_are_in_buffer) {
-    std::move(__values.get(), __values.get() + __size, __first);
+    return __empty{};
   }
 
-  return __empty{};
-}
+  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}
 
-_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
-
-} // namespace __libdispatch
-} // namespace __par_backend
+  static constexpr size_t __lane_size = 64;
+};
 
+} // namespace __pstl
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
index b0db70f58b2ef4..d034447904872e 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__config>
 #include <__iterator/concepts.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/move.h>
 #include <optional>
@@ -45,7 +46,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    auto __res = __par_backend::__parallel_merge(
+    auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_merge(
         __first1,
         __last1,
         __first2,

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
index afcc7ffb266130..c3d2905daed170 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H
 
 #include <__config>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__utility/empty.h>
 #include <__utility/move.h>
 #include <cstddef>
@@ -26,54 +27,55 @@ _LIBCPP_PUSH_MACROS
 #  include <__undef_macros>
 
 _LIBCPP_BEGIN_NAMESPACE_STD
-
-namespace __par_backend {
-inline namespace __serial_cpu_backend {
-
-template <class _RandomAccessIterator, class _Fp>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
-  __f(__first, __last);
-  return __empty{};
-}
-
-template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
-_LIBCPP_HIDE_FROM_ABI optional<_Tp>
-__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
-  return __reduce(std::move(__first), std::move(__last), std::move(__init));
-}
-
-template <class _RandomAccessIterator, class _Compare, class _LeafSort>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
-    _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
-  __leaf_sort(__first, __last, __comp);
-  return __empty{};
-}
-
-_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
-
-template <class _RandomAccessIterator1,
-          class _RandomAccessIterator2,
-          class _RandomAccessIterator3,
-          class _Compare,
-          class _LeafMerge>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
-    _RandomAccessIterator1 __first1,
-    _RandomAccessIterator1 __last1,
-    _RandomAccessIterator2 __first2,
-    _RandomAccessIterator2 __last2,
-    _RandomAccessIterator3 __outit,
-    _Compare __comp,
-    _LeafMerge __leaf_merge) {
-  __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
-  return __empty{};
-}
-
-// TODO: Complete this list
-
-} // namespace __serial_cpu_backend
-} // namespace __par_backend
-
+namespace __pstl {
+
+struct __serial_backend_tag {};
+
+template <>
+struct __cpu_traits<__serial_backend_tag> {
+  template <class _RandomAccessIterator, class _Fp>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+  __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
+    __f(__first, __last);
+    return __empty{};
+  }
+
+  template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
+  _LIBCPP_HIDE_FROM_ABI static optional<_Tp>
+  __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
+    return __reduce(std::move(__first), std::move(__last), std::move(__init));
+  }
+
+  template <class _RandomAccessIterator, class _Compare, class _LeafSort>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort(
+      _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
+    __leaf_sort(__first, __last, __comp);
+    return __empty{};
+  }
+
+  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}
+
+  template <class _RandomAccessIterator1,
+            class _RandomAccessIterator2,
+            class _RandomAccessIterator3,
+            class _Compare,
+            class _LeafMerge>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge(
+      _RandomAccessIterator1 __first1,
+      _RandomAccessIterator1 __last1,
+      _RandomAccessIterator2 __first2,
+      _RandomAccessIterator2 __last2,
+      _RandomAccessIterator3 __outit,
+      _Compare __comp,
+      _LeafMerge __leaf_merge) {
+    __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
+    return __empty{};
+  }
+
+  static constexpr size_t __lane_size = 64;
+};
+
+} // namespace __pstl
 _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
index 34c423586c4b74..ebfa0fc69147d5 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h
@@ -12,6 +12,7 @@
 #include <__algorithm/pstl_backends/cpu_backends/backend.h>
 #include <__algorithm/stable_sort.h>
 #include <__config>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__utility/empty.h>
 #include <optional>
@@ -28,7 +29,7 @@ template <class _ExecutionPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI optional<__empty>
 __pstl_stable_sort(__cpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy>) {
-    return __par_backend::__parallel_stable_sort(
+    return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_stable_sort(
         __first, __last, __comp, [](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) {
           std::stable_sort(__g_first, __g_last, __g_comp);
         });

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
index eb11a961b760c3..8d1cb221c3d82a 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h
@@ -11,6 +11,7 @@
 
 #include <__assert>
 #include <__config>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__utility/empty.h>
 #include <__utility/move.h>
 #include <cstddef>
@@ -29,52 +30,55 @@ _LIBCPP_PUSH_MACROS
 // by a proper implementation once the PSTL implementation is somewhat stable.
 
 _LIBCPP_BEGIN_NAMESPACE_STD
-
-namespace __par_backend {
-inline namespace __thread_cpu_backend {
-
-template <class _RandomAccessIterator, class _Fp>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
-  __f(__first, __last);
-  return __empty{};
-}
-
-template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
-_LIBCPP_HIDE_FROM_ABI optional<_Tp>
-__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
-  return __reduce(std::move(__first), std::move(__last), std::move(__init));
-}
-
-template <class _RandomAccessIterator, class _Compare, class _LeafSort>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort(
-    _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
-  __leaf_sort(__first, __last, __comp);
-  return __empty{};
-}
-
-_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
-
-template <class _RandomAccessIterator1,
-          class _RandomAccessIterator2,
-          class _RandomAccessIterator3,
-          class _Compare,
-          class _LeafMerge>
-_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge(
-    _RandomAccessIterator1 __first1,
-    _RandomAccessIterator1 __last1,
-    _RandomAccessIterator2 __first2,
-    _RandomAccessIterator2 __last2,
-    _RandomAccessIterator3 __outit,
-    _Compare __comp,
-    _LeafMerge __leaf_merge) {
-  __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
-  return __empty{};
-}
-
-} // namespace __thread_cpu_backend
-} // namespace __par_backend
-
+namespace __pstl {
+
+struct __std_thread_backend_tag {};
+
+template <>
+struct __cpu_traits<__std_thread_backend_tag> {
+  template <class _RandomAccessIterator, class _Fp>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
+  __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
+    __f(__first, __last);
+    return __empty{};
+  }
+
+  template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
+  _LIBCPP_HIDE_FROM_ABI static optional<_Tp>
+  __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) {
+    return __reduce(std::move(__first), std::move(__last), std::move(__init));
+  }
+
+  template <class _RandomAccessIterator, class _Compare, class _LeafSort>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort(
+      _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) {
+    __leaf_sort(__first, __last, __comp);
+    return __empty{};
+  }
+
+  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}
+
+  template <class _RandomAccessIterator1,
+            class _RandomAccessIterator2,
+            class _RandomAccessIterator3,
+            class _Compare,
+            class _LeafMerge>
+  _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge(
+      _RandomAccessIterator1 __first1,
+      _RandomAccessIterator1 __last1,
+      _RandomAccessIterator2 __first2,
+      _RandomAccessIterator2 __last2,
+      _RandomAccessIterator3 __outit,
+      _Compare __comp,
+      _LeafMerge __leaf_merge) {
+    __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp);
+    return __empty{};
+  }
+
+  static constexpr size_t __lane_size = 64;
+};
+
+} // namespace __pstl
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
index fdf1a2e78dad90..d4c383997a67a9 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h
@@ -14,6 +14,7 @@
 #include <__config>
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/enable_if.h>
 #include <__type_traits/is_execution_policy.h>
 #include <__type_traits/remove_cvref.h>
@@ -49,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    std::__par_backend::__parallel_for(
+    __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
         __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
           auto __res = std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>(
               __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op);
@@ -97,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform(
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) {
-    auto __res = std::__par_backend::__parallel_for(
+    auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
         __first1,
         __last1,
         [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) {

diff  --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
index 376abd39fa36e0..956c7d6a88ce29 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
@@ -14,6 +14,7 @@
 #include <__iterator/concepts.h>
 #include <__iterator/iterator_traits.h>
 #include <__numeric/transform_reduce.h>
+#include <__pstl/cpu_algos/cpu_traits.h>
 #include <__type_traits/desugars_to.h>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_execution_policy.h>
@@ -32,7 +33,8 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <typename _DifferenceType,
+template <typename _Backend,
+          typename _DifferenceType,
           typename _Tp,
           typename _BinaryOperation,
           typename _UnaryOperation,
@@ -48,7 +50,8 @@ __simd_transform_reduce(_DifferenceType __n, _Tp __init, _BinaryOperation, _Unar
   return __init;
 }
 
-template <typename _Size,
+template <typename _Backend,
+          typename _Size,
           typename _Tp,
           typename _BinaryOperation,
           typename _UnaryOperation,
@@ -58,7 +61,8 @@ template <typename _Size,
                         int>    = 0>
 _LIBCPP_HIDE_FROM_ABI _Tp
 __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept {
-  const _Size __block_size = __lane_size / sizeof(_Tp);
+  constexpr size_t __lane_size = __pstl::__cpu_traits<_Backend>::__lane_size;
+  const _Size __block_size     = __lane_size / sizeof(_Tp);
   if (__n > 2 * __block_size && __block_size > 1) {
     alignas(__lane_size) char __lane_buffer[__lane_size];
     _Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer);
@@ -116,7 +120,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
-    return __par_backend::__parallel_transform_reduce(
+    return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce(
         __first1,
         std::move(__last1),
         [__first1, __first2, __transform](_ForwardIterator1 __iter) {
@@ -138,7 +142,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
   } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                        __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                        __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
-    return std::__simd_transform_reduce(
+    return std::__simd_transform_reduce<__cpu_backend_tag>(
         __last1 - __first1, std::move(__init), std::move(__reduce), [&](__iter_
diff _t<_ForwardIterator1> __i) {
           return __transform(__first1[__i], __first2[__i]);
         });
@@ -163,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
     _UnaryOperation __transform) {
   if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                 __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return __par_backend::__parallel_transform_reduce(
+    return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce(
         std::move(__first),
         std::move(__last),
         [__transform](_ForwardIterator __iter) { return __transform(*__iter); },
@@ -182,7 +186,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
         });
   } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                        __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
-    return std::__simd_transform_reduce(
+    return std::__simd_transform_reduce<__cpu_backend_tag>(
         __last - __first,
         std::move(__init),
         std::move(__reduce),

diff  --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
new file mode 100644
index 00000000000000..2f0db46e9be83a
--- /dev/null
+++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
@@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H
+#define _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H
+
+#include <__config>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+namespace __pstl {
+
+// __cpu_traits
+//
+// This traits class encapsulates the basis operations for a CPU-based implementation of the PSTL.
+// All the operations in the PSTL can be implemented from these basis operations, so a pure CPU backend
+// only needs to customize these traits in order to get an implementation of the whole PSTL.
+//
+// Basis operations
+// ================
+//
+//  template <class _RandomAccessIterator, class _Functor>
+//  optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
+//    - __func must take a subrange of [__first, __last) that should be executed in serial
+//
+//  template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
+//  optional<_Tp> __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp,
+//  _Reduction);
+//
+//  template <class _RandomAccessIterator1,
+//            class _RandomAccessIterator2,
+//            class _RandomAccessIterator3,
+//            class _Compare,
+//            class _LeafMerge>
+//  optional<_RandomAccessIterator3> __parallel_merge(_RandomAccessIterator1 __first1,
+//                                                    _RandomAccessIterator1 __last1,
+//                                                    _RandomAccessIterator2 __first2,
+//                                                    _RandomAccessIterator2 __last2,
+//                                                    _RandomAccessIterator3 __outit,
+//                                                    _Compare __comp,
+//                                                    _LeafMerge __leaf_merge);
+//
+//  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
+//  optional<__empty> __parallel_stable_sort(_RandomAccessIterator __first,
+//                                           _RandomAccessIterator __last,
+//                                           _Comp __comp,
+//                                           _LeafSort __leaf_sort);
+//
+//   void __cancel_execution();
+//      Cancel the execution of other jobs - they aren't needed anymore. This is not a binding request,
+//      some backends may not actually be able to cancel jobs.
+//
+//   constexpr size_t __lane_size;
+//      Size of SIMD lanes.
+//      TODO: Merge this with __native_vector_size from __algorithm/simd_utils.h
+//
+//
+// Exception handling
+// ==================
+//
+// CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
+// implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are
+// turned into a program termination at the front-end level. When a backend returns a disengaged `optional` to the
+// frontend, the frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to
+// the user.
+
+template <class _Backend>
+struct __cpu_traits;
+
+} // namespace __pstl
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H

diff  --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index ccee7029824e33..7312a744ef6793 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1613,6 +1613,8 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric
 module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" }
 module std_private_numeric_transform_reduce         [system] { header "__numeric/transform_reduce.h" }
 
+module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" }
+
 module std_private_queue_fwd [system] { header "__fwd/queue.h" }
 
 module std_private_random_bernoulli_distribution          [system] { header "__random/bernoulli_distribution.h" }

diff  --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp
index 52d4afbcce6e00..d997a9c73463d3 100644
--- a/libcxx/src/pstl/libdispatch.cpp
+++ b/libcxx/src/pstl/libdispatch.cpp
@@ -12,8 +12,7 @@
 #include <dispatch/dispatch.h>
 
 _LIBCPP_BEGIN_NAMESPACE_STD
-
-namespace __par_backend::inline __libdispatch {
+namespace __pstl::__libdispatch {
 
 void __dispatch_apply(size_t chunk_count, void* context, void (*func)(void* context, size_t chunk)) noexcept {
   ::dispatch_apply_f(chunk_count, DISPATCH_APPLY_AUTO, context, func);
@@ -29,7 +28,5 @@ __chunk_partitions __partition_chunks(ptr
diff _t element_count) noexcept {
   return partitions;
 }
 
-// NOLINTNEXTLINE(llvm-namespace-comment) // This is https://llvm.org/PR56804
-} // namespace __par_backend::inline __libdispatch
-
+} // namespace __pstl::__libdispatch
 _LIBCPP_END_NAMESPACE_STD

diff  --git a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
index 554924a0179d56..8c7016a80b811a 100644
--- a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
+++ b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp
@@ -18,21 +18,21 @@
 
 int main(int, char**) {
   {
-    auto chunks = std::__par_backend::__libdispatch::__partition_chunks(0);
+    auto chunks = std::__pstl::__libdispatch::__partition_chunks(0);
     assert(chunks.__chunk_count_ == 1);
     assert(chunks.__first_chunk_size_ == 0);
     assert(chunks.__chunk_size_ == 0);
   }
 
   {
-    auto chunks = std::__par_backend::__libdispatch::__partition_chunks(1);
+    auto chunks = std::__pstl::__libdispatch::__partition_chunks(1);
     assert(chunks.__chunk_count_ == 1);
     assert(chunks.__first_chunk_size_ == 1);
     assert(chunks.__chunk_size_ == 1);
   }
 
   for (std::ptr
diff _t i = 2; i != 2ll << 20; ++i) {
-    auto chunks = std::__par_backend::__libdispatch::__partition_chunks(i);
+    auto chunks = std::__pstl::__libdispatch::__partition_chunks(i);
     assert(chunks.__chunk_count_ >= 1);
     assert(chunks.__chunk_count_ <= i);
     assert((chunks.__chunk_count_ - 1) * chunks.__chunk_size_ + chunks.__first_chunk_size_ == i);

diff  --git a/libcxx/utils/generate_iwyu_mapping.py b/libcxx/utils/generate_iwyu_mapping.py
index 2265438ab49cc7..33607f19968411 100644
--- a/libcxx/utils/generate_iwyu_mapping.py
+++ b/libcxx/utils/generate_iwyu_mapping.py
@@ -10,6 +10,7 @@ def IWYU_mapping(header: str) -> typing.Optional[typing.List[str]]:
     ignore = [
         "__debug_utils/.+",
         "__fwd/get[.]h",
+        "__pstl/.+",
         "__support/.+",
         "__utility/private_constructor_tag.h",
     ]