[libcxx-commits] [libcxx] [libc++] Vectorize mismatch (PR #73255)

Denis Yaroshevskiy via libcxx-commits libcxx-commits at lists.llvm.org
Sat Feb 24 08:45:09 PST 2024

@@ -11,23 +11,89 @@
 #include <__algorithm/comp.h>
+#include <__algorithm/simd_utils.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
-#include <__iterator/iterator_traits.h>
+#include <__functional/identity.h>
+#include <__type_traits/invoke.h>
+#include <__type_traits/is_constant_evaluated.h>
+#include <__type_traits/is_equality_comparable.h>
+#include <__type_traits/operation_traits.h>
+#include <__utility/move.h>
 #include <__utility/pair.h>
 #  pragma GCC system_header
+#include <__undef_macros>
+template <class _Iter1, class _Sent1, class _Iter2, class _Pred, class _Proj1, class _Proj2>
+__mismatch_loop(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { // Scalar fallback: walks both ranges one element at a time.
+  while (__first1 != __last1) { // Only the first range is bounded; __first2 is advanced in lockstep without its own end check.
+    if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2))) // Project each element, then apply the predicate.
+      break; // First pair for which the predicate is false: stop here.
+    ++__first1;
+    ++__first2;
+  }
+  return std::make_pair(std::move(__first1), std::move(__first2)); // Positions of the first failing pair, or (__last1, matching __first2) if none failed.
+template <class _Iter1, class _Sent1, class _Iter2, class _Pred, class _Proj1, class _Proj2>
+__mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) { // Generic overload, taking arbitrary iterator/sentinel types.
+  return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); // Forwards all arguments unchanged to the element-by-element loop.
+// Vectorized overload of __mismatch for raw pointers to an integral type.
+// Selected only when the predicate desugars to equality and both projections
+// are identity, so whole vectors can be compared with operator==.
+// Returns the pair of pointers to the first differing elements; falls back to
+// __mismatch_loop for whatever remains after the vectorized loop.
+template <class _Tp,
+          class _Pred,
+          class _Proj1,
+          class _Proj2,
+          __enable_if_t<is_integral<_Tp>::value && __desugars_to<__equal_tag, _Pred, _Tp, _Tp>::value &&
+                            __is_identity<_Proj1>::value && __is_identity<_Proj2>::value,
+                        int> = 0>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
+  constexpr size_t __unroll_count = 4;
+  constexpr size_t __vec_size     = __native_vector_size<_Tp>;
+  using __vec                     = __simd_vector<_Tp, __vec_size>;
+  // Process __unroll_count vectors per iteration. Skipped entirely during
+  // constant evaluation and once fewer than a full unrolled batch remains.
+  while (!__libcpp_is_constant_evaluated() && static_cast<size_t>(__last1 - __first1) >= __unroll_count * __vec_size) {
+    __vec __lhs[__unroll_count];
+    __vec __rhs[__unroll_count];
+    // Issue all loads first, then do the comparisons.
+    for (size_t __i = 0; __i != __unroll_count; ++__i) {
+      __lhs[__i] = std::__load_vector<__vec>(__first1 + __i * __vec_size);
+      __rhs[__i] = std::__load_vector<__vec>(__first2 + __i * __vec_size);
+    }
+    for (size_t __i = 0; __i != __unroll_count; ++__i) {
+      if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
+        // Vector __i was loaded from element offset __i * __vec_size, so the
+        // mismatch position is that base plus the lane index within the vector
+        // (scaling by __unroll_count here would point before the real mismatch
+        // whenever __vec_size != __unroll_count).
+        auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
+        return {__first1 + __offset, __first2 + __offset};
+      }
+    }
+    __first1 += __unroll_count * __vec_size;
+    __first2 += __unroll_count * __vec_size;
+  }
+  // Scalar tail: fewer than __unroll_count * __vec_size elements remain (or we
+  // are in constant evaluation).
+  return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
DenisYaroshevskiy wrote:

fyi: this tail is not amazing, especially for chars; chars are slow.

The good tail is to load two overlapping registers of half size.
That is very annoying to write though, so up to you.

see here how memcmp does it:


More information about the libcxx-commits mailing list