[libcxx-commits] [libcxx] [libc++] Optimize lexicographical_compare (PR #65279)
Nikolas Klauser via libcxx-commits
libcxx-commits at lists.llvm.org
Sat Jul 27 05:06:57 PDT 2024
================
@@ -11,39 +11,96 @@
#include <__algorithm/comp.h>
#include <__algorithm/comp_ref_type.h>
+#include <__algorithm/max.h>
+#include <__algorithm/mismatch.h>
+#include <__algorithm/simd_utils.h>
#include <__config>
+#include <__functional/identity.h>
#include <__iterator/iterator_traits.h>
+#include <__string/constexpr_c_functions.h>
+#include <__type_traits/desugars_to.h>
+#include <__type_traits/invoke.h>
+#include <__type_traits/is_equality_comparable.h>
+#include <__type_traits/is_integral.h>
+#include <__type_traits/is_volatile.h>
+#include <cstring>
+#include <cwchar>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _Compare, class _InputIterator1, class _InputIterator2>
+template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Proj1, class _Proj2, class _Comp>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __lexicographical_compare(
- _InputIterator1 __first1,
- _InputIterator1 __last1,
- _InputIterator2 __first2,
- _InputIterator2 __last2,
- _Compare __comp) {
- for (; __first2 != __last2; ++__first1, (void)++__first2) {
- if (__first1 == __last1 || __comp(*__first1, *__first2))
+ _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Sent2 __last2, _Comp& __comp, _Proj1& __proj1, _Proj2& __proj2) {
+ while (__first2 != __last2) {
+ if (__first1 == __last1 ||
+ std::__invoke(__comp, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2)))
return true;
- if (__comp(*__first2, *__first1))
+ if (std::__invoke(__comp, std::__invoke(__proj2, *__first2), std::__invoke(__proj1, *__first1)))
return false;
+ ++__first1;
+ ++__first2;
}
return false;
}
+#if _LIBCPP_VECTORIZE_ALGORITHMS
----------------
philnik777 wrote:
It turns out that the `mismatch`-based variant is still quite a bit faster even at `-Os`, so I think we should simply always enable this optimization:
```
------------------------------------------------------------------------------------
Benchmark naive mismatch
------------------------------------------------------------------------------------
bm_lexicographical_compare<unsigned char>/1 1.34 ns 2.28 ns
bm_lexicographical_compare<unsigned char>/2 1.93 ns 2.51 ns
bm_lexicographical_compare<unsigned char>/3 2.71 ns 2.52 ns
bm_lexicographical_compare<unsigned char>/4 3.48 ns 2.17 ns
bm_lexicographical_compare<unsigned char>/5 4.15 ns 2.15 ns
bm_lexicographical_compare<unsigned char>/6 4.86 ns 2.08 ns
bm_lexicographical_compare<unsigned char>/7 5.47 ns 2.07 ns
bm_lexicographical_compare<unsigned char>/8 6.20 ns 2.07 ns
bm_lexicographical_compare<unsigned char>/16 11.9 ns 2.07 ns
bm_lexicographical_compare<unsigned char>/64 45.6 ns 2.30 ns
bm_lexicographical_compare<unsigned char>/512 367 ns 6.47 ns
bm_lexicographical_compare<unsigned char>/4096 2881 ns 41.0 ns
bm_lexicographical_compare<unsigned char>/32768 23029 ns 488 ns
bm_lexicographical_compare<unsigned char>/262144 184128 ns 4330 ns
bm_lexicographical_compare<unsigned char>/1048576 737395 ns 20003 ns
bm_lexicographical_compare<signed char>/1 1.40 ns 1.42 ns
bm_lexicographical_compare<signed char>/2 2.19 ns 1.69 ns
bm_lexicographical_compare<signed char>/3 2.92 ns 2.22 ns
bm_lexicographical_compare<signed char>/4 3.72 ns 2.67 ns
bm_lexicographical_compare<signed char>/5 4.42 ns 3.32 ns
bm_lexicographical_compare<signed char>/6 5.08 ns 3.78 ns
bm_lexicographical_compare<signed char>/7 5.74 ns 4.48 ns
bm_lexicographical_compare<signed char>/8 6.50 ns 5.10 ns
bm_lexicographical_compare<signed char>/16 12.1 ns 8.60 ns
bm_lexicographical_compare<signed char>/64 45.9 ns 31.4 ns
bm_lexicographical_compare<signed char>/512 365 ns 247 ns
bm_lexicographical_compare<signed char>/4096 2888 ns 1925 ns
bm_lexicographical_compare<signed char>/32768 23055 ns 15343 ns
bm_lexicographical_compare<signed char>/262144 184544 ns 122678 ns
bm_lexicographical_compare<signed char>/1048576 738252 ns 491933 ns
bm_lexicographical_compare<int>/1 1.40 ns 1.41 ns
bm_lexicographical_compare<int>/2 1.87 ns 1.69 ns
bm_lexicographical_compare<int>/3 2.37 ns 2.13 ns
bm_lexicographical_compare<int>/4 2.87 ns 2.47 ns
bm_lexicographical_compare<int>/5 3.45 ns 2.85 ns
bm_lexicographical_compare<int>/6 4.11 ns 3.24 ns
bm_lexicographical_compare<int>/7 4.65 ns 3.53 ns
bm_lexicographical_compare<int>/8 5.20 ns 3.92 ns
bm_lexicographical_compare<int>/16 9.88 ns 6.73 ns
bm_lexicographical_compare<int>/64 39.1 ns 22.7 ns
bm_lexicographical_compare<int>/512 317 ns 165 ns
bm_lexicographical_compare<int>/4096 2485 ns 1282 ns
bm_lexicographical_compare<int>/32768 20957 ns 9958 ns
bm_lexicographical_compare<int>/262144 159614 ns 79793 ns
bm_lexicographical_compare<int>/1048576 640461 ns 321012 ns
bm_ranges_lexicographical_compare<unsigned char>/1 1.40 ns 2.29 ns
bm_ranges_lexicographical_compare<unsigned char>/2 1.87 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/3 2.38 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/4 3.07 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/5 3.72 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/6 4.45 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/7 5.13 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/8 5.78 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/16 11.3 ns 2.07 ns
bm_ranges_lexicographical_compare<unsigned char>/64 44.1 ns 2.30 ns
bm_ranges_lexicographical_compare<unsigned char>/512 356 ns 6.48 ns
bm_ranges_lexicographical_compare<unsigned char>/4096 2802 ns 40.2 ns
bm_ranges_lexicographical_compare<unsigned char>/32768 22428 ns 476 ns
bm_ranges_lexicographical_compare<unsigned char>/262144 181003 ns 4457 ns
bm_ranges_lexicographical_compare<unsigned char>/1048576 725626 ns 20100 ns
bm_ranges_lexicographical_compare<signed char>/1 1.40 ns 1.61 ns
bm_ranges_lexicographical_compare<signed char>/2 1.65 ns 1.83 ns
bm_ranges_lexicographical_compare<signed char>/3 2.36 ns 2.12 ns
bm_ranges_lexicographical_compare<signed char>/4 3.08 ns 2.51 ns
bm_ranges_lexicographical_compare<signed char>/5 3.72 ns 2.91 ns
bm_ranges_lexicographical_compare<signed char>/6 4.42 ns 3.37 ns
bm_ranges_lexicographical_compare<signed char>/7 5.16 ns 3.79 ns
bm_ranges_lexicographical_compare<signed char>/8 5.78 ns 4.25 ns
bm_ranges_lexicographical_compare<signed char>/16 11.3 ns 7.71 ns
bm_ranges_lexicographical_compare<signed char>/64 45.0 ns 29.6 ns
bm_ranges_lexicographical_compare<signed char>/512 356 ns 234 ns
bm_ranges_lexicographical_compare<signed char>/4096 2808 ns 1841 ns
bm_ranges_lexicographical_compare<signed char>/32768 22411 ns 14730 ns
bm_ranges_lexicographical_compare<signed char>/262144 214881 ns 117844 ns
bm_ranges_lexicographical_compare<signed char>/1048576 716978 ns 471653 ns
bm_ranges_lexicographical_compare<int>/1 1.17 ns 1.61 ns
bm_ranges_lexicographical_compare<int>/2 1.64 ns 2.07 ns
bm_ranges_lexicographical_compare<int>/3 2.21 ns 2.29 ns
bm_ranges_lexicographical_compare<int>/4 2.81 ns 2.53 ns
bm_ranges_lexicographical_compare<int>/5 3.28 ns 2.90 ns
bm_ranges_lexicographical_compare<int>/6 3.87 ns 3.29 ns
bm_ranges_lexicographical_compare<int>/7 4.61 ns 3.63 ns
bm_ranges_lexicographical_compare<int>/8 5.07 ns 3.91 ns
bm_ranges_lexicographical_compare<int>/16 9.99 ns 6.68 ns
bm_ranges_lexicographical_compare<int>/64 39.1 ns 21.7 ns
bm_ranges_lexicographical_compare<int>/512 318 ns 165 ns
bm_ranges_lexicographical_compare<int>/4096 2491 ns 1284 ns
bm_ranges_lexicographical_compare<int>/32768 19472 ns 10030 ns
bm_ranges_lexicographical_compare<int>/262144 159860 ns 79518 ns
bm_ranges_lexicographical_compare<int>/1048576 640198 ns 320270 ns
```
https://github.com/llvm/llvm-project/pull/65279
More information about the libcxx-commits mailing list