[libcxx-commits] [libcxx] [libc++] Optimize lexicographical_compare (PR #65279)

Nikolas Klauser via libcxx-commits libcxx-commits at lists.llvm.org
Sat Jul 27 05:06:57 PDT 2024


================
@@ -11,39 +11,96 @@
 
 #include <__algorithm/comp.h>
 #include <__algorithm/comp_ref_type.h>
+#include <__algorithm/max.h>
+#include <__algorithm/mismatch.h>
+#include <__algorithm/simd_utils.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
+#include <__string/constexpr_c_functions.h>
+#include <__type_traits/desugars_to.h>
+#include <__type_traits/invoke.h>
+#include <__type_traits/is_equality_comparable.h>
+#include <__type_traits/is_integral.h>
+#include <__type_traits/is_volatile.h>
+#include <cstring>
+#include <cwchar>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _Compare, class _InputIterator1, class _InputIterator2>
+template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Proj1, class _Proj2, class _Comp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __lexicographical_compare(
-    _InputIterator1 __first1,
-    _InputIterator1 __last1,
-    _InputIterator2 __first2,
-    _InputIterator2 __last2,
-    _Compare __comp) {
-  for (; __first2 != __last2; ++__first1, (void)++__first2) {
-    if (__first1 == __last1 || __comp(*__first1, *__first2))
+    _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Sent2 __last2, _Comp& __comp, _Proj1& __proj1, _Proj2& __proj2) {
+  while (__first2 != __last2) {
+    if (__first1 == __last1 ||
+        std::__invoke(__comp, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2)))
       return true;
-    if (__comp(*__first2, *__first1))
+    if (std::__invoke(__comp, std::__invoke(__proj2, *__first2), std::__invoke(__proj1, *__first1)))
       return false;
+    ++__first1;
+    ++__first2;
   }
   return false;
 }
 
+#if _LIBCPP_VECTORIZE_ALGORITHMS
----------------
philnik777 wrote:

It turns out that the `mismatch` variant is still quite a bit faster even at `-Os` (for all but the very smallest inputs, where the naive loop is marginally ahead), so I think we should simply always enable this optimization:
```
------------------------------------------------------------------------------------
Benchmark                                                      naive        mismatch
------------------------------------------------------------------------------------
bm_lexicographical_compare<unsigned char>/1                  1.34 ns         2.28 ns
bm_lexicographical_compare<unsigned char>/2                  1.93 ns         2.51 ns
bm_lexicographical_compare<unsigned char>/3                  2.71 ns         2.52 ns
bm_lexicographical_compare<unsigned char>/4                  3.48 ns         2.17 ns
bm_lexicographical_compare<unsigned char>/5                  4.15 ns         2.15 ns
bm_lexicographical_compare<unsigned char>/6                  4.86 ns         2.08 ns
bm_lexicographical_compare<unsigned char>/7                  5.47 ns         2.07 ns
bm_lexicographical_compare<unsigned char>/8                  6.20 ns         2.07 ns
bm_lexicographical_compare<unsigned char>/16                 11.9 ns         2.07 ns
bm_lexicographical_compare<unsigned char>/64                 45.6 ns         2.30 ns
bm_lexicographical_compare<unsigned char>/512                 367 ns         6.47 ns
bm_lexicographical_compare<unsigned char>/4096               2881 ns         41.0 ns
bm_lexicographical_compare<unsigned char>/32768             23029 ns          488 ns
bm_lexicographical_compare<unsigned char>/262144           184128 ns         4330 ns
bm_lexicographical_compare<unsigned char>/1048576          737395 ns        20003 ns
bm_lexicographical_compare<signed char>/1                    1.40 ns         1.42 ns
bm_lexicographical_compare<signed char>/2                    2.19 ns         1.69 ns
bm_lexicographical_compare<signed char>/3                    2.92 ns         2.22 ns
bm_lexicographical_compare<signed char>/4                    3.72 ns         2.67 ns
bm_lexicographical_compare<signed char>/5                    4.42 ns         3.32 ns
bm_lexicographical_compare<signed char>/6                    5.08 ns         3.78 ns
bm_lexicographical_compare<signed char>/7                    5.74 ns         4.48 ns
bm_lexicographical_compare<signed char>/8                    6.50 ns         5.10 ns
bm_lexicographical_compare<signed char>/16                   12.1 ns         8.60 ns
bm_lexicographical_compare<signed char>/64                   45.9 ns         31.4 ns
bm_lexicographical_compare<signed char>/512                   365 ns          247 ns
bm_lexicographical_compare<signed char>/4096                 2888 ns         1925 ns
bm_lexicographical_compare<signed char>/32768               23055 ns        15343 ns
bm_lexicographical_compare<signed char>/262144             184544 ns       122678 ns
bm_lexicographical_compare<signed char>/1048576            738252 ns       491933 ns
bm_lexicographical_compare<int>/1                            1.40 ns         1.41 ns
bm_lexicographical_compare<int>/2                            1.87 ns         1.69 ns
bm_lexicographical_compare<int>/3                            2.37 ns         2.13 ns
bm_lexicographical_compare<int>/4                            2.87 ns         2.47 ns
bm_lexicographical_compare<int>/5                            3.45 ns         2.85 ns
bm_lexicographical_compare<int>/6                            4.11 ns         3.24 ns
bm_lexicographical_compare<int>/7                            4.65 ns         3.53 ns
bm_lexicographical_compare<int>/8                            5.20 ns         3.92 ns
bm_lexicographical_compare<int>/16                           9.88 ns         6.73 ns
bm_lexicographical_compare<int>/64                           39.1 ns         22.7 ns
bm_lexicographical_compare<int>/512                           317 ns          165 ns
bm_lexicographical_compare<int>/4096                         2485 ns         1282 ns
bm_lexicographical_compare<int>/32768                       20957 ns         9958 ns
bm_lexicographical_compare<int>/262144                     159614 ns        79793 ns
bm_lexicographical_compare<int>/1048576                    640461 ns       321012 ns
bm_ranges_lexicographical_compare<unsigned char>/1           1.40 ns         2.29 ns
bm_ranges_lexicographical_compare<unsigned char>/2           1.87 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/3           2.38 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/4           3.07 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/5           3.72 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/6           4.45 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/7           5.13 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/8           5.78 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/16          11.3 ns         2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/64          44.1 ns         2.30 ns
bm_ranges_lexicographical_compare<unsigned char>/512          356 ns         6.48 ns
bm_ranges_lexicographical_compare<unsigned char>/4096        2802 ns         40.2 ns
bm_ranges_lexicographical_compare<unsigned char>/32768      22428 ns          476 ns
bm_ranges_lexicographical_compare<unsigned char>/262144    181003 ns         4457 ns
bm_ranges_lexicographical_compare<unsigned char>/1048576   725626 ns        20100 ns
bm_ranges_lexicographical_compare<signed char>/1             1.40 ns         1.61 ns
bm_ranges_lexicographical_compare<signed char>/2             1.65 ns         1.83 ns
bm_ranges_lexicographical_compare<signed char>/3             2.36 ns         2.12 ns
bm_ranges_lexicographical_compare<signed char>/4             3.08 ns         2.51 ns
bm_ranges_lexicographical_compare<signed char>/5             3.72 ns         2.91 ns
bm_ranges_lexicographical_compare<signed char>/6             4.42 ns         3.37 ns
bm_ranges_lexicographical_compare<signed char>/7             5.16 ns         3.79 ns
bm_ranges_lexicographical_compare<signed char>/8             5.78 ns         4.25 ns
bm_ranges_lexicographical_compare<signed char>/16            11.3 ns         7.71 ns
bm_ranges_lexicographical_compare<signed char>/64            45.0 ns         29.6 ns
bm_ranges_lexicographical_compare<signed char>/512            356 ns          234 ns
bm_ranges_lexicographical_compare<signed char>/4096          2808 ns         1841 ns
bm_ranges_lexicographical_compare<signed char>/32768        22411 ns        14730 ns
bm_ranges_lexicographical_compare<signed char>/262144      214881 ns       117844 ns
bm_ranges_lexicographical_compare<signed char>/1048576     716978 ns       471653 ns
bm_ranges_lexicographical_compare<int>/1                     1.17 ns         1.61 ns
bm_ranges_lexicographical_compare<int>/2                     1.64 ns         2.07 ns
bm_ranges_lexicographical_compare<int>/3                     2.21 ns         2.29 ns
bm_ranges_lexicographical_compare<int>/4                     2.81 ns         2.53 ns
bm_ranges_lexicographical_compare<int>/5                     3.28 ns         2.90 ns
bm_ranges_lexicographical_compare<int>/6                     3.87 ns         3.29 ns
bm_ranges_lexicographical_compare<int>/7                     4.61 ns         3.63 ns
bm_ranges_lexicographical_compare<int>/8                     5.07 ns         3.91 ns
bm_ranges_lexicographical_compare<int>/16                    9.99 ns         6.68 ns
bm_ranges_lexicographical_compare<int>/64                    39.1 ns         21.7 ns
bm_ranges_lexicographical_compare<int>/512                    318 ns          165 ns
bm_ranges_lexicographical_compare<int>/4096                  2491 ns         1284 ns
bm_ranges_lexicographical_compare<int>/32768                19472 ns        10030 ns
bm_ranges_lexicographical_compare<int>/262144              159860 ns        79518 ns
bm_ranges_lexicographical_compare<int>/1048576             640198 ns       320270 ns
```

https://github.com/llvm/llvm-project/pull/65279


More information about the libcxx-commits mailing list