[libcxx-commits] [libcxx] 985c1a4 - [libc++] Optimize the two range overload of mismatch (#86853)

via libcxx-commits libcxx-commits at lists.llvm.org
Mon Apr 1 09:21:55 PDT 2024


Author: Nikolas Klauser
Date: 2024-04-01T18:21:51+02:00
New Revision: 985c1a44f8d49e0afeba907fe29d881c19b319fc

URL: https://github.com/llvm/llvm-project/commit/985c1a44f8d49e0afeba907fe29d881c19b319fc
DIFF: https://github.com/llvm/llvm-project/commit/985c1a44f8d49e0afeba907fe29d881c19b319fc.diff

LOG: [libc++] Optimize the two range overload of mismatch (#86853)

```
-----------------------------------------------------------------------------
Benchmark                                                 old             new
-----------------------------------------------------------------------------
bm_mismatch_two_range_overload<char>/1               0.941 ns         1.88 ns
bm_mismatch_two_range_overload<char>/2                1.43 ns         2.15 ns
bm_mismatch_two_range_overload<char>/3                1.95 ns         2.55 ns
bm_mismatch_two_range_overload<char>/4                2.58 ns         2.90 ns
bm_mismatch_two_range_overload<char>/5                3.75 ns         3.31 ns
bm_mismatch_two_range_overload<char>/6                5.00 ns         3.83 ns
bm_mismatch_two_range_overload<char>/7                5.59 ns         4.35 ns
bm_mismatch_two_range_overload<char>/8                6.37 ns         4.84 ns
bm_mismatch_two_range_overload<char>/16               11.8 ns         6.72 ns
bm_mismatch_two_range_overload<char>/64               45.5 ns         2.59 ns
bm_mismatch_two_range_overload<char>/512               366 ns         12.6 ns
bm_mismatch_two_range_overload<char>/4096             2890 ns         91.6 ns
bm_mismatch_two_range_overload<char>/32768           23038 ns          758 ns
bm_mismatch_two_range_overload<char>/262144         142813 ns         6573 ns
bm_mismatch_two_range_overload<char>/1048576        366679 ns        26710 ns
bm_mismatch_two_range_overload<short>/1              0.934 ns         1.88 ns
bm_mismatch_two_range_overload<short>/2               1.30 ns         2.58 ns
bm_mismatch_two_range_overload<short>/3               1.76 ns         3.28 ns
bm_mismatch_two_range_overload<short>/4               2.24 ns         3.98 ns
bm_mismatch_two_range_overload<short>/5               2.80 ns         4.92 ns
bm_mismatch_two_range_overload<short>/6               3.58 ns         6.01 ns
bm_mismatch_two_range_overload<short>/7               4.29 ns         7.03 ns
bm_mismatch_two_range_overload<short>/8               4.67 ns         7.39 ns
bm_mismatch_two_range_overload<short>/16              9.86 ns         13.1 ns
bm_mismatch_two_range_overload<short>/64              38.9 ns         4.55 ns
bm_mismatch_two_range_overload<short>/512              348 ns         27.7 ns
bm_mismatch_two_range_overload<short>/4096            2881 ns          225 ns
bm_mismatch_two_range_overload<short>/32768          23111 ns         1715 ns
bm_mismatch_two_range_overload<short>/262144        184846 ns        14416 ns
bm_mismatch_two_range_overload<short>/1048576       742885 ns        57264 ns
bm_mismatch_two_range_overload<int>/1                0.838 ns         1.19 ns
bm_mismatch_two_range_overload<int>/2                 1.19 ns         1.65 ns
bm_mismatch_two_range_overload<int>/3                 1.83 ns         2.06 ns
bm_mismatch_two_range_overload<int>/4                 2.38 ns         2.42 ns
bm_mismatch_two_range_overload<int>/5                 3.60 ns         2.47 ns
bm_mismatch_two_range_overload<int>/6                 3.68 ns         3.05 ns
bm_mismatch_two_range_overload<int>/7                 4.32 ns         3.36 ns
bm_mismatch_two_range_overload<int>/8                 5.18 ns         3.58 ns
bm_mismatch_two_range_overload<int>/16                10.6 ns         2.84 ns
bm_mismatch_two_range_overload<int>/64                39.0 ns         7.78 ns
bm_mismatch_two_range_overload<int>/512                247 ns         53.9 ns
bm_mismatch_two_range_overload<int>/4096              1927 ns          429 ns
bm_mismatch_two_range_overload<int>/32768            15569 ns         3393 ns
bm_mismatch_two_range_overload<int>/262144          125413 ns        28504 ns
bm_mismatch_two_range_overload<int>/1048576         504549 ns       112729 ns
```

Added: 
    

Modified: 
    libcxx/benchmarks/algorithms/mismatch.bench.cpp
    libcxx/include/__algorithm/mismatch.h
    libcxx/include/__algorithm/ranges_mismatch.h
    libcxx/include/__algorithm/simd_utils.h
    libcxx/test/libcxx/transitive_includes/cxx23.csv
    libcxx/test/libcxx/transitive_includes/cxx26.csv
    libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp

Removed: 
    


################################################################################
diff  --git a/libcxx/benchmarks/algorithms/mismatch.bench.cpp b/libcxx/benchmarks/algorithms/mismatch.bench.cpp
index 06289068bb0492..791782879011e2 100644
--- a/libcxx/benchmarks/algorithms/mismatch.bench.cpp
+++ b/libcxx/benchmarks/algorithms/mismatch.bench.cpp
@@ -37,4 +37,20 @@ BENCHMARK(bm_mismatch<char>)->Apply(BenchmarkSizes);
 BENCHMARK(bm_mismatch<short>)->Apply(BenchmarkSizes);
 BENCHMARK(bm_mismatch<int>)->Apply(BenchmarkSizes);
 
+template <class T>
+static void bm_mismatch_two_range_overload(benchmark::State& state) {
+  std::vector<T> vec1(state.range(), '1');
+  std::vector<T> vec2(state.range(), '1');
+  std::mt19937_64 rng(std::random_device{}());
+
+  vec1.back() = '2';
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::mismatch(vec1.begin(), vec1.end(), vec2.begin(), vec2.end()));
+  }
+}
+BENCHMARK(bm_mismatch_two_range_overload<char>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_mismatch_two_range_overload<short>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_mismatch_two_range_overload<int>)->DenseRange(1, 8)->Range(16, 1 << 20);
+
 BENCHMARK_MAIN();

diff  --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 1cb83b01b2ebe5..8abb273ac17822 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -11,6 +11,7 @@
 #define _LIBCPP___ALGORITHM_MISMATCH_H
 
 #include <__algorithm/comp.h>
+#include <__algorithm/min.h>
 #include <__algorithm/simd_utils.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__config>
@@ -136,6 +137,25 @@ mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __fi
 }
 
 #if _LIBCPP_STD_VER >= 14
+template <class _Iter1, class _Sent1, class _Iter2, class _Sent2, class _Pred, class _Proj1, class _Proj2>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter1, _Iter2> __mismatch(
+    _Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Sent2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
+  while (__first1 != __last1 && __first2 != __last2) {
+    if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2)))
+      break;
+    ++__first1;
+    ++__first2;
+  }
+  return {std::move(__first1), std::move(__first2)};
+}
+
+template <class _Tp, class _Pred, class _Proj1, class _Proj2>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Tp* __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
+  auto __len = std::min(__last1 - __first1, __last2 - __first2);
+  return std::__mismatch(__first1, __first1 + __len, __first2, __pred, __proj1, __proj2);
+}
+
 template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
 mismatch(_InputIterator1 __first1,
@@ -143,10 +163,16 @@ mismatch(_InputIterator1 __first1,
          _InputIterator2 __first2,
          _InputIterator2 __last2,
          _BinaryPredicate __pred) {
-  for (; __first1 != __last1 && __first2 != __last2; ++__first1, (void)++__first2)
-    if (!__pred(*__first1, *__first2))
-      break;
-  return pair<_InputIterator1, _InputIterator2>(__first1, __first2);
+  __identity __proj;
+  auto __res = std::__mismatch(
+      std::__unwrap_iter(__first1),
+      std::__unwrap_iter(__last1),
+      std::__unwrap_iter(__first2),
+      std::__unwrap_iter(__last2),
+      __pred,
+      __proj,
+      __proj);
+  return {std::__rewrap_iter(__first1, __res.first), std::__rewrap_iter(__first2, __res.second)};
 }
 
 template <class _InputIterator1, class _InputIterator2>

diff  --git a/libcxx/include/__algorithm/ranges_mismatch.h b/libcxx/include/__algorithm/ranges_mismatch.h
index 037af39126230a..d8a7dd43af09d5 100644
--- a/libcxx/include/__algorithm/ranges_mismatch.h
+++ b/libcxx/include/__algorithm/ranges_mismatch.h
@@ -10,6 +10,8 @@
 #define _LIBCPP___ALGORITHM_RANGES_MISMATCH_H
 
 #include <__algorithm/in_in_result.h>
+#include <__algorithm/mismatch.h>
+#include <__algorithm/unwrap_range.h>
 #include <__config>
 #include <__functional/identity.h>
 #include <__functional/invoke.h>
@@ -42,13 +44,17 @@ struct __fn {
   template <class _I1, class _S1, class _I2, class _S2, class _Pred, class _Proj1, class _Proj2>
   static _LIBCPP_HIDE_FROM_ABI constexpr mismatch_result<_I1, _I2>
   __go(_I1 __first1, _S1 __last1, _I2 __first2, _S2 __last2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
-    while (__first1 != __last1 && __first2 != __last2) {
-      if (!std::invoke(__pred, std::invoke(__proj1, *__first1), std::invoke(__proj2, *__first2)))
-        break;
-      ++__first1;
-      ++__first2;
+    if constexpr (forward_iterator<_I1> && forward_iterator<_I2>) {
+      auto __range1 = std::__unwrap_range(__first1, __last1);
+      auto __range2 = std::__unwrap_range(__first2, __last2);
+      auto __res =
+          std::__mismatch(__range1.first, __range1.second, __range2.first, __range2.second, __pred, __proj1, __proj2);
+      return {std::__rewrap_range<_S1>(__first1, __res.first), std::__rewrap_range<_S2>(__first2, __res.second)};
+    } else {
+      auto __res = std::__mismatch(
+          std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), __pred, __proj1, __proj2);
+      return {std::move(__res.first), std::move(__res.second)};
     }
-    return {std::move(__first1), std::move(__first2)};
   }
 
   template <input_iterator _I1,
@@ -71,8 +77,8 @@ struct __fn {
             class _Proj1 = identity,
             class _Proj2 = identity>
     requires indirectly_comparable<iterator_t<_R1>, iterator_t<_R2>, _Pred, _Proj1, _Proj2>
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr mismatch_result<borrowed_iterator_t<_R1>,
-                                                                        borrowed_iterator_t<_R2>>
+  _LIBCPP_NODISCARD_EXT
+  _LIBCPP_HIDE_FROM_ABI constexpr mismatch_result<borrowed_iterator_t<_R1>, borrowed_iterator_t<_R2>>
   operator()(_R1&& __r1, _R2&& __r2, _Pred __pred = {}, _Proj1 __proj1 = {}, _Proj2 __proj2 = {}) const {
     return __go(
         ranges::begin(__r1), ranges::end(__r1), ranges::begin(__r2), ranges::end(__r2), __pred, __proj1, __proj2);

diff  --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 1aedb3db010f77..989a1957987e1e 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___ALGORITHM_SIMD_UTILS_H
 #define _LIBCPP___ALGORITHM_SIMD_UTILS_H
 
+#include <__algorithm/min.h>
 #include <__bit/bit_cast.h>
 #include <__bit/countr.h>
 #include <__config>
@@ -22,6 +23,9 @@
 #  pragma GCC system_header
 #endif
 
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
 // TODO: Find out how altivec changes things and allow vectorizations there too.
 #if _LIBCPP_STD_VER >= 14 && defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1700 && !defined(__ALTIVEC__)
 #  define _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS 1
@@ -94,7 +98,8 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_T
 
   // This has MSan disabled due to https://github.com/llvm/llvm-project/issues/85876
   auto __impl = [&]<class _MaskT>(_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept {
-    return std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)));
+    return std::min<size_t>(
+        _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
   };
 
   if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) {
@@ -120,4 +125,6 @@ _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS
 
+_LIBCPP_POP_MACROS
+
 #endif // _LIBCPP___ALGORITHM_SIMD_UTILS_H

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv
index 79c67dc00cfb9b..69429b5bce8250 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx23.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv
@@ -452,6 +452,7 @@ random vector
 random version
 ranges compare
 ranges cstddef
+ranges cstdint
 ranges cwchar
 ranges initializer_list
 ranges iterator

diff  --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 79c67dc00cfb9b..69429b5bce8250 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -452,6 +452,7 @@ random vector
 random version
 ranges compare
 ranges cstddef
+ranges cstdint
 ranges cwchar
 ranges initializer_list
 ranges iterator

diff  --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
index 55c9eea863c3ff..eb5f7cacdde34b 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
@@ -187,8 +187,8 @@ int main(int, char**) {
   { // check the tail of the vectorized loop
     for (size_t vec_size = 1; vec_size != 256; ++vec_size) {
       {
-        std::vector<char> lhs(256);
-        std::vector<char> rhs(256);
+        std::vector<char> lhs(vec_size);
+        std::vector<char> rhs(vec_size);
 
         check<char*>(lhs, rhs, lhs.size());
         lhs.back() = 1;
@@ -199,8 +199,8 @@ int main(int, char**) {
         rhs.back() = 0;
       }
       {
-        std::vector<int> lhs(256);
-        std::vector<int> rhs(256);
+        std::vector<int> lhs(vec_size);
+        std::vector<int> rhs(vec_size);
 
         check<int*>(lhs, rhs, lhs.size());
         lhs.back() = 1;


        


More information about the libcxx-commits mailing list