[libcxx-commits] [libcxx] 68b1035 - [libc++][PSTL] Add a __parallel_sort implementation to libdispatch

Nikolas Klauser via libcxx-commits libcxx-commits at lists.llvm.org
Tue Aug 15 12:39:47 PDT 2023


Author: Nikolas Klauser
Date: 2023-08-15T12:20:40-07:00
New Revision: 68b1035965f08d7cdfdcb6a5c17d53e45896f3f0

URL: https://github.com/llvm/llvm-project/commit/68b1035965f08d7cdfdcb6a5c17d53e45896f3f0
DIFF: https://github.com/llvm/llvm-project/commit/68b1035965f08d7cdfdcb6a5c17d53e45896f3f0.diff

LOG: [libc++][PSTL] Add a __parallel_sort implementation to libdispatch

Reviewed By: #libc, ldionne

Spies: ldionne, libcxx-commits

Differential Revision: https://reviews.llvm.org/D155136

Added: 
    libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp

Modified: 
    libcxx/benchmarks/CMakeLists.txt
    libcxx/benchmarks/algorithms/stable_sort.bench.cpp
    libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
    libcxx/src/pstl/libdispatch.cpp
    libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp
    libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp
    libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp
    libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp

Removed: 
    


################################################################################
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index b5d3fbf05ed802..92f295ef399008 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -161,6 +161,7 @@ set(BENCHMARK_TESTS
     algorithms/min.bench.cpp
     algorithms/min_max_element.bench.cpp
     algorithms/pop_heap.bench.cpp
+    algorithms/pstl.stable_sort.bench.cpp
     algorithms/push_heap.bench.cpp
     algorithms/ranges_make_heap.bench.cpp
     algorithms/ranges_make_heap_then_sort_heap.bench.cpp

diff --git a/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp b/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp
new file mode 100644
index 00000000000000..9357b870bece64
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "common.h"
+
+namespace {
+template <class ValueType, class Order>
+struct StableSort {
+  size_t Quantity;
+
+  void run(benchmark::State& state) const {
+    runOpOnCopies<ValueType>(state, Quantity, Order(), BatchSize::CountBatch, [](auto& Copy) {
+      std::stable_sort(std::execution::par, Copy.begin(), Copy.end());
+    });
+  }
+
+  bool skip() const { return Order() == ::Order::Heap; }
+
+  std::string name() const {
+    return "BM_pstl_stable_sort" + ValueType::name() + Order::name() + "/" + std::to_string(Quantity);
+  }
+};
+} // namespace
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+  makeCartesianProductBenchmark<StableSort, AllValueTypes, AllOrders>(Quantities);
+  benchmark::RunSpecifiedBenchmarks();
+}

diff --git a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp b/libcxx/benchmarks/algorithms/stable_sort.bench.cpp
index fe88d01dd66f89..024a036e0c2582 100644
--- a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp
+++ b/libcxx/benchmarks/algorithms/stable_sort.bench.cpp
@@ -16,7 +16,7 @@ struct StableSort {
   size_t Quantity;
 
   void run(benchmark::State& state) const {
-    runOpOnCopies<ValueType>(state, Quantity, Order(), BatchSize::CountElements, [](auto& Copy) {
+    runOpOnCopies<ValueType>(state, Quantity, Order(), BatchSize::CountBatch, [](auto& Copy) {
       std::stable_sort(Copy.begin(), Copy.end());
     });
   }

diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
index 51240e92e0d0b7..50b6e0b1d0a03f 100644
--- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
+++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h
@@ -9,8 +9,10 @@
 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
 
+#include <__algorithm/inplace_merge.h>
 #include <__algorithm/lower_bound.h>
 #include <__algorithm/max.h>
+#include <__algorithm/merge.h>
 #include <__algorithm/upper_bound.h>
 #include <__atomic/atomic.h>
 #include <__config>
@@ -21,7 +23,6 @@
 #include <__memory/construct_at.h>
 #include <__memory/unique_ptr.h>
 #include <__numeric/reduce.h>
-#include <__utility/exception_guard.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 #include <__utility/terminate_on_exception.h>
@@ -61,10 +62,7 @@ struct __chunk_partitions {
 
 template <class _RandomAccessIterator, class _Functor>
 _LIBCPP_HIDE_FROM_ABI void
-__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
-  auto __partitions = __libdispatch::__partition_chunks(__last - __first);
-
-  // Perform the chunked execution.
+__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
   __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
     auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
     auto __index =
@@ -75,6 +73,13 @@ __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fun
   });
 }
 
+template <class _RandomAccessIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
+  return __libdispatch::__dispatch_parallel_for(
+      __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
+}
+
 template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
 struct __merge_range {
   __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
@@ -220,11 +225,92 @@ _LIBCPP_HIDE_FROM_ABI _Value __parallel_transform_reduce(
   });
 }
 
-// TODO: parallelize this
 template <class _RandomAccessIterator, class _Comp, class _LeafSort>
 _LIBCPP_HIDE_FROM_ABI void __parallel_stable_sort(
     _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
-  __leaf_sort(__first, __last, __comp);
+  const auto __size = __last - __first;
+  auto __partitions = __libdispatch::__partition_chunks(__size);
+
+  if (__partitions.__chunk_count_ == 0)
+    return;
+
+  if (__partitions.__chunk_count_ == 1)
+    return __leaf_sort(__first, __last, __comp);
+
+  using _Value = __iter_value_type<_RandomAccessIterator>;
+
+  auto __destroy = [__size](_Value* __ptr) {
+    std::destroy_n(__ptr, __size);
+    std::allocator<_Value>().deallocate(__ptr, __size);
+  };
+
+  // TODO: use __uninitialized_buffer
+  unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);
+
+  return std::__terminate_on_exception([&] {
+    // Initialize all elements to a moved-from state
+    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
+    std::__construct_at(__values.get(), std::move(*__first));
+    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
+      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
+    }
+    *__first = std::move(__values.get()[__size - 1]);
+
+    __libdispatch::__dispatch_parallel_for(
+        __partitions,
+        __first,
+        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
+          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
+        });
+
+    bool __objects_are_in_buffer = false;
+    do {
+      const auto __old_chunk_size = __partitions.__chunk_size_;
+      if (__partitions.__chunk_count_ % 2 == 1) {
+        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
+          std::inplace_merge(
+              __first_chunk_begin,
+              __first_chunk_begin + __partitions.__first_chunk_size_,
+              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
+              __comp);
+        };
+        if (__objects_are_in_buffer)
+          __inplace_merge_chunks(__values.get());
+        else
+          __inplace_merge_chunks(__first);
+        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
+      } else {
+        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
+      }
+
+      __partitions.__chunk_size_ *= 2;
+      __partitions.__chunk_count_ /= 2;
+
+      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
+        __libdispatch::__dispatch_parallel_for(
+            __partitions,
+            __from_first,
+            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
+              std::merge(std::make_move_iterator(__chunk_first),
+                         std::make_move_iterator(__chunk_last - __old_chunk_size),
+                         std::make_move_iterator(__chunk_last - __old_chunk_size),
+                         std::make_move_iterator(__chunk_last),
+                         __to_first + (__chunk_first - __from_first),
+                         __comp);
+            });
+      };
+
+      if (__objects_are_in_buffer)
+        __merge_chunks(__values.get(), __first);
+      else
+        __merge_chunks(__first, __values.get());
+      __objects_are_in_buffer = !__objects_are_in_buffer;
+    } while (__partitions.__chunk_count_ > 1);
+
+    if (__objects_are_in_buffer) {
+      std::move(__values.get(), __values.get() + __size, __first);
+    }
+  });
 }
 
 _LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}

diff --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp
index 0ed028b37096ec..52d4afbcce6e00 100644
--- a/libcxx/src/pstl/libdispatch.cpp
+++ b/libcxx/src/pstl/libdispatch.cpp
@@ -24,6 +24,8 @@ __chunk_partitions __partition_chunks(ptrdiff_t element_count) noexcept {
   partitions.__chunk_count_      = std::max<ptrdiff_t>(1, element_count / 256);
   partitions.__chunk_size_       = element_count / partitions.__chunk_count_;
   partitions.__first_chunk_size_ = element_count - (partitions.__chunk_count_ - 1) * partitions.__chunk_size_;
+  if (partitions.__chunk_count_ == 0 && element_count > 0)
+    partitions.__chunk_count_ = 1;
   return partitions;
 }
 

diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp
index f581b3dfbaea4c..82ba4338886d69 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp
@@ -62,7 +62,7 @@ struct Test {
     // check that a large number of elements works
     std::vector<int> vec(100);
     std::fill(vec.begin(), vec.end(), 3);
-    assert(std::all_of(Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; }));
+    assert(std::all_of(policy, Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; }));
   }
 };
 

diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp
index 62a7c152207fbb..c21ce8afe556bc 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp
@@ -56,7 +56,7 @@ struct Test {
     // check that a large number of elements works
     std::vector<int> vec(100, 2);
     vec[96] = 3;
-    assert(std::any_of(Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; }));
+    assert(std::any_of(policy, Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; }));
   }
 };
 

diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp
index a42260d77593bd..f6dfe0cbf7b3de 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp
@@ -56,7 +56,7 @@ struct Test {
     // check that a large number of elements works
     std::vector<int> vec(100);
     std::fill(vec.begin(), vec.end(), 3);
-    assert(std::none_of(Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i != 3; }));
+    assert(std::none_of(policy, Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i != 3; }));
   }
 };
 

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp
index 7298b367f7f39e..0c4dda5b03ca5c 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp
@@ -30,6 +30,7 @@
 #include "test_macros.h"
 #include "test_execution_policies.h"
 #include "test_iterators.h"
+#include "MoveOnly.h"
 
 EXECUTION_POLICY_SFINAE_TEST(stable_sort);
 
@@ -45,9 +46,9 @@ struct OrderedValue {
   auto operator>(const OrderedValue& rhs) const { return value > rhs.value; }
 };
 
-template <class Iter, std::size_t N>
-void test_one(std::array<int, N> input, std::array<int, N> expected) {
-  std::stable_sort(Iter(input.data()), Iter(input.data() + input.size()));
+template <class Iter, std::size_t N, class Policy, class ValueT = typename std::iterator_traits<Iter>::value_type>
+void test_one(Policy&& policy, std::array<ValueT, N> input, std::array<ValueT, N> expected) {
+  std::stable_sort(policy, Iter(input.data()), Iter(input.data() + input.size()));
   assert(input == expected);
 }
 
@@ -55,27 +56,26 @@ template <class Iter>
 struct Test {
   template <class Policy>
   void operator()(Policy&& policy) {
-
     // Empty sequence.
-    test_one<Iter, 0>({}, {});
+    test_one<Iter, 0>(policy, {}, {});
     // 1-element sequence.
-    test_one<Iter, 1>({1}, {1});
+    test_one<Iter, 1>(policy, {1}, {1});
     // 2-element sequence.
-    test_one<Iter, 2>({2, 1}, {1, 2});
+    test_one<Iter, 2>(policy, {2, 1}, {1, 2});
     // 3-element sequence.
-    test_one<Iter, 3>({2, 1, 3}, {1, 2, 3});
+    test_one<Iter, 3>(policy, {2, 1, 3}, {1, 2, 3});
     // Longer sequence.
-    test_one<Iter, 8>({2, 1, 3, 6, 8, 4, 11, 5}, {1, 2, 3, 4, 5, 6, 8, 11});
+    test_one<Iter, 8>(policy, {2, 1, 3, 6, 8, 4, 11, 5}, {1, 2, 3, 4, 5, 6, 8, 11});
     // Longer sequence with duplicates.
-    test_one<Iter, 7>({2, 1, 3, 6, 2, 8, 6}, {1, 2, 2, 3, 6, 6, 8});
+    test_one<Iter, 7>(policy, {2, 1, 3, 6, 2, 8, 6}, {1, 2, 2, 3, 6, 6, 8});
     // All elements are the same.
-    test_one<Iter, 3>({1, 1, 1}, {1, 1, 1});
+    test_one<Iter, 3>(policy, {1, 1, 1}, {1, 1, 1});
     // Already sorted.
-    test_one<Iter, 5>({1, 2, 3, 4, 5}, {1, 2, 3, 4, 5});
+    test_one<Iter, 5>(policy, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5});
     // Reverse-sorted.
-    test_one<Iter, 5>({5, 4, 3, 2, 1}, {1, 2, 3, 4, 5});
+    test_one<Iter, 5>(policy, {5, 4, 3, 2, 1}, {1, 2, 3, 4, 5});
     // Repeating pattern.
-    test_one<Iter, 6>({1, 2, 1, 2, 1, 2}, {1, 1, 1, 2, 2, 2});
+    test_one<Iter, 6>(policy, {1, 2, 1, 2, 1, 2}, {1, 1, 1, 2, 2, 2});
 
     { // The sort is stable (equivalent elements remain in the same order).
       using V        = OrderedValue;
@@ -126,8 +126,25 @@ struct Test {
   }
 };
 
+struct NotDefaultConstructible {
+  NotDefaultConstructible(int i) : i_(i) {}
+
+  int i_;
+
+  friend bool operator==(NotDefaultConstructible lhs, NotDefaultConstructible rhs) {
+    return lhs.i_ == rhs.i_;
+  }
+
+  friend bool operator<(NotDefaultConstructible lhs, NotDefaultConstructible rhs) {
+    return lhs.i_ < rhs.i_;
+  }
+};
+
 int main(int, char**) {
-  types::for_each(types::random_access_iterator_list<int*>{}, TestIteratorWithPolicies<Test>{});
+  types::for_each(types::concatenate_t<types::random_access_iterator_list<int*>,
+                                       types::random_access_iterator_list<MoveOnly*>,
+                                       types::random_access_iterator_list<NotDefaultConstructible*>>{},
+                  TestIteratorWithPolicies<Test>{});
 
 #ifndef TEST_HAS_NO_EXCEPTIONS
   std::set_terminate(terminate_successful);


        


More information about the libcxx-commits mailing list