[libcxx-commits] [libcxx] [libc++] Optimize ranges::minmax (PR #87335)

Thu Apr 4 14:20:56 PDT 2024

https://github.com/philnik777 updated https://github.com/llvm/llvm-project/pull/87335

>From 2fa896aad6d526866475fd1fd2f6d3dda9430e02 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Mon, 11 Mar 2024 13:18:12 +0100
Subject: [PATCH] [libc++] Optimize ranges::minmax

---
 libcxx/benchmarks/CMakeLists.txt              |  1 +
 libcxx/benchmarks/algorithms/minmax.bench.cpp | 68 +++++++++++++++++++
 libcxx/docs/ReleaseNotes/19.rst               |  2 +
 libcxx/include/__algorithm/comp.h             |  3 +
 libcxx/include/__algorithm/ranges_minmax.h    | 17 ++++-
 libcxx/include/__functional/operations.h      |  6 ++
 .../include/__functional/ranges_operations.h  |  3 +
 libcxx/include/__type_traits/desugars_to.h    |  1 +
 8 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 libcxx/benchmarks/algorithms/minmax.bench.cpp

diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 387e013afeb6c4..928238c1ac69ba 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -182,6 +182,7 @@ set(BENCHMARK_TESTS
     algorithms/make_heap.bench.cpp
     algorithms/make_heap_then_sort_heap.bench.cpp
     algorithms/min.bench.cpp
+    algorithms/minmax.bench.cpp
     algorithms/min_max_element.bench.cpp
     algorithms/mismatch.bench.cpp
     algorithms/pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/minmax.bench.cpp b/libcxx/benchmarks/algorithms/minmax.bench.cpp
new file mode 100644
index 00000000000000..b0ff7f91c19939
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/minmax.bench.cpp
@@ -0,0 +1,68 @@
+#include <algorithm>
+#include <cassert>
+
+#include <benchmark/benchmark.h>
+
+void run_sizes(auto benchmark) {
+  benchmark->Arg(1)
+      ->Arg(2)
+      ->Arg(3)
+      ->Arg(4)
+      ->Arg(5)
+      ->Arg(6)
+      ->Arg(7)
+      ->Arg(8)
+      ->Arg(9)
+      ->Arg(10)
+      ->Arg(11)
+      ->Arg(12)
+      ->Arg(13)
+      ->Arg(14)
+      ->Arg(15)
+      ->Arg(16)
+      ->Arg(17)
+      ->Arg(18)
+      ->Arg(19)
+      ->Arg(20)
+      ->Arg(21)
+      ->Arg(22)
+      ->Arg(23)
+      ->Arg(24)
+      ->Arg(25)
+      ->Arg(26)
+      ->Arg(27)
+      ->Arg(28)
+      ->Arg(29)
+      ->Arg(30)
+      ->Arg(31)
+      ->Arg(32)
+      ->Arg(64)
+      ->Arg(512)
+      ->Arg(1024)
+      ->Arg(4000)
+      ->Arg(4096)
+      ->Arg(5500)
+      ->Arg(64000)
+      ->Arg(65536)
+      ->Arg(70000);
+}
+
+template <class T>
+static void BM_std_minmax(benchmark::State& state) {
+  std::vector<T> vec(state.range(), 3);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec);
+    benchmark::DoNotOptimize(std::ranges::minmax(vec));
+  }
+}
+BENCHMARK(BM_std_minmax<char>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<short>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<int>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<long long>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned char>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned short>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned int>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned long long>)->Apply(run_sizes);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 2da9df54a53198..a420b599cd597e 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -54,6 +54,8 @@ Improvements and New Features
   resulting in a performance increase of up to 1400x.
 - The ``std::mismatch`` algorithm has been optimized for integral types, which can lead up to 40x performance
   improvements.
+- The ``std::ranges::minmax`` algorithm has been optimized for integral types, resulting in a performance increase of
+  up to 100x.
 
 - The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ``<strstream>`` available.
 
diff --git a/libcxx/include/__algorithm/comp.h b/libcxx/include/__algorithm/comp.h
index a089375e3da139..a0fa88d6d2acd3 100644
--- a/libcxx/include/__algorithm/comp.h
+++ b/libcxx/include/__algorithm/comp.h
@@ -41,6 +41,9 @@ struct __less<void, void> {
   }
 };
 
+template <class _Tp>
+inline const bool __desugars_to_v<__less_tag, __less<>, _Tp, _Tp> = true;
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___ALGORITHM_COMP_H
diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h
index 22a62b620c936f..049263b615fc0e 100644
--- a/libcxx/include/__algorithm/ranges_minmax.h
+++ b/libcxx/include/__algorithm/ranges_minmax.h
@@ -24,6 +24,8 @@
 #include <__ranges/access.h>
 #include <__ranges/concepts.h>
 #include <__type_traits/is_reference.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <__type_traits/operation_traits.h>
 #include <__type_traits/remove_cvref.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
@@ -83,7 +85,20 @@ struct __fn {
 
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range has to contain at least one element");
 
-    if constexpr (forward_range<_Range>) {
+    // This optimization is not in minmax_element because clang doesn't see through the pointers and as a result doesn't
+    // vectorize the code.
+    if constexpr (contiguous_range<_Range> && is_integral_v<_ValueT> &&
+                  __is_cheap_to_copy<_ValueT> & __is_identity<_Proj>::value &&
+                  __desugars_to_v<__less_tag, _Comp, _ValueT, _ValueT>) {
+      minmax_result<_ValueT> __result = {__r[0], __r[0]};
+      for (auto __e : __r) {
+        if (__e < __result.min)
+          __result.min = __e;
+        if (__result.max < __e)
+          __result.max = __e;
+      }
+      return __result;
+    } else if constexpr (forward_range<_Range>) {
       // Special-case the one element case. Avoid repeatedly initializing objects from the result of an iterator
       // dereference when doing so might not be idempotent. The `if constexpr` avoids the extra branch in cases where
       // it's not needed.
diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h
index 9aa28e4925069c..240f127e542553 100644
--- a/libcxx/include/__functional/operations.h
+++ b/libcxx/include/__functional/operations.h
@@ -359,6 +359,9 @@ struct _LIBCPP_TEMPLATE_VIS less : __binary_function<_Tp, _Tp, bool> {
 };
 _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(less);
 
+template <class _Tp>
+inline const bool __desugars_to_v<__less_tag, less<_Tp>, _Tp, _Tp> = true;
+
 #if _LIBCPP_STD_VER >= 14
 template <>
 struct _LIBCPP_TEMPLATE_VIS less<void> {
@@ -370,6 +373,9 @@ struct _LIBCPP_TEMPLATE_VIS less<void> {
   }
   typedef void is_transparent;
 };
+
+template <class _Tp>
+inline const bool __desugars_to_v<__less_tag, less<>, _Tp, _Tp> = true;
 #endif
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/__functional/ranges_operations.h b/libcxx/include/__functional/ranges_operations.h
index a9dffaf6962585..27f06eadd0eb11 100644
--- a/libcxx/include/__functional/ranges_operations.h
+++ b/libcxx/include/__functional/ranges_operations.h
@@ -99,6 +99,9 @@ struct greater_equal {
 template <class _Tp, class _Up>
 inline const bool __desugars_to_v<__equal_tag, ranges::equal_to, _Tp, _Up> = true;
 
+template <class _Tp, class _Up>
+inline const bool __desugars_to_v<__less_tag, ranges::less, _Tp, _Up> = true;
+
 #endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__type_traits/desugars_to.h b/libcxx/include/__type_traits/desugars_to.h
index a8f69c28dfc520..97a2ee5448f203 100644
--- a/libcxx/include/__type_traits/desugars_to.h
+++ b/libcxx/include/__type_traits/desugars_to.h
@@ -20,6 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // Tags to represent the canonical operations
 struct __equal_tag {};
 struct __plus_tag {};
+struct __less_tag {};
 
 // This class template is used to determine whether an operation "desugars"
 // (or boils down) to a given canonical operation.