[libcxx-commits] [libcxx] [libc++] Vectorize std::mismatch with trivially equality comparable types (PR #87716)
Nikolas Klauser via libcxx-commits
libcxx-commits at lists.llvm.org
Sat Apr 6 13:22:22 PDT 2024
https://github.com/philnik777 updated https://github.com/llvm/llvm-project/pull/87716
>From 9eb34352d9b4aa4b8afb0d1411649b55c4f0eeba Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Mon, 25 Mar 2024 08:25:23 +0100
Subject: [PATCH] [libc++] Vectorize trivially equality comparable types
---
libcxx/include/CMakeLists.txt | 1 +
libcxx/include/__algorithm/mismatch.h | 60 +++++++---
libcxx/include/__algorithm/simd_utils.h | 27 ++++-
libcxx/include/__iterator/aliasing_iterator.h | 106 ++++++++++++++++++
.../mismatch/mismatch.pass.cpp | 47 ++++++--
5 files changed, 215 insertions(+), 26 deletions(-)
create mode 100644 libcxx/include/__iterator/aliasing_iterator.h
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 097a41d4c41740..5922c9106853ed 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -456,6 +456,7 @@ set(files
__ios/fpos.h
__iterator/access.h
__iterator/advance.h
+ __iterator/aliasing_iterator.h
__iterator/back_insert_iterator.h
__iterator/bounded_iter.h
__iterator/common_iterator.h
diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 4ada29eabc470c..cfabbe5fc7b337 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -16,6 +16,8 @@
#include <__algorithm/unwrap_iter.h>
#include <__config>
#include <__functional/identity.h>
+#include <__iterator/aliasing_iterator.h>
+#include <__type_traits/copy_cv.h>
#include <__type_traits/desugars_to.h>
#include <__type_traits/invoke.h>
#include <__type_traits/is_constant_evaluated.h>
@@ -53,20 +55,15 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
}
-#if _LIBCPP_VECTORIZE_ALGORITHMS
+#if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS
-template <class _Tp,
- class _Pred,
- class _Proj1,
- class _Proj2,
- __enable_if_t<is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> &&
- __is_identity<_Proj1>::value && __is_identity<_Proj2>::value,
- int> = 0>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
-__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
+template <class _Iter>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter>
+__mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
+ using __value_type = __iter_value_type<_Iter>;
constexpr size_t __unroll_count = 4;
- constexpr size_t __vec_size = __native_vector_size<_Tp>;
- using __vec = __simd_vector<_Tp, __vec_size>;
+ constexpr size_t __vec_size = __native_vector_size<__value_type>;
+ using __vec = __simd_vector<__value_type, __vec_size>;
if (!__libcpp_is_constant_evaluated()) {
auto __orig_first1 = __first1;
@@ -116,9 +113,46 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __
} // else loop over the elements individually
}
- return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
+ __equal_to __pred;
+ __identity __proj;
+ return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj, __proj);
+}
+
+#endif // _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS
+
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+
+template <class _Tp,
+ class _Pred,
+ class _Proj1,
+ class _Proj2,
+ __enable_if_t<is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> &&
+ __is_identity<_Proj1>::value && __is_identity<_Proj2>::value,
+ int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred&, _Proj1&, _Proj2&) {
+ return std::__mismatch_vectorized(__first1, __last1, __first2);
}
+template <class _Tp,
+ class _Pred,
+ class _Proj1,
+ class _Proj2,
+ __enable_if_t<!is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp> &&
+ __is_identity<_Proj1>::value && __is_identity<_Proj2>::value &&
+ __can_map_to_integer_v<_Tp> && __libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value,
+ int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Tp*, _Tp*>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __proj1, _Proj2& __proj2) {
+ if (__libcpp_is_constant_evaluated()) {
+ return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
+ } else {
+ using _Iter = __aliasing_iterator<_Tp, __get_as_integer_type<_Tp>>;
+ // This is valid because we disable TBAA when loading vectors. Alignment requirements still have to be fulfilled.
+ auto __ret = std::__mismatch_vectorized(_Iter(__first1), _Iter(__last1), _Iter(__first2));
+ return {__ret.first.base(), __ret.second.base()};
+ }
+}
#endif // _LIBCPP_VECTORIZE_ALGORITHMS
template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 989a1957987e1e..e99cf1ba9203b3 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -43,6 +43,27 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
+template <class _Tp>
+inline constexpr bool __can_map_to_integer_v =
+ sizeof(_Tp) == alignof(_Tp) && (sizeof(_Tp) == 1 || sizeof(_Tp) == 2 || sizeof(_Tp) == 4 || sizeof(_Tp) == 8);
+
+template <class _Tp>
+_LIBCPP_HIDE_FROM_ABI auto __get_as_integer_type_impl() {
+ if constexpr (sizeof(_Tp) == 1)
+ return uint8_t{};
+ else if constexpr (sizeof(_Tp) == 2)
+ return uint16_t{};
+ else if constexpr (sizeof(_Tp) == 4)
+ return uint32_t{};
+ else if constexpr (sizeof(_Tp) == 8)
+ return uint64_t{};
+ else
+ static_assert(false, "Unexpected size type");
+}
+
+template <class _Tp>
+using __get_as_integer_type = decltype(std::__get_as_integer_type_impl<_Tp>());
+
// This isn't specialized for 64 byte vectors on purpose. They have the potential to significantly reduce performance
// in mixed simd/non-simd workloads and don't provide any performance improvement for currently vectorized algorithms
// as far as benchmarks are concerned.
@@ -80,10 +101,10 @@ template <class _VecT>
using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{}));
// This isn't inlined without always_inline when loading chars.
-template <class _VecT, class _Tp>
-_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(const _Tp* __ptr) noexcept {
+template <class _VecT, class _Iter>
+_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept {
return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept {
- return _VecT{__ptr[_Indices]...};
+ return _VecT{__iter[_Indices]...};
}(make_index_sequence<__simd_vector_size_v<_VecT>>{});
}
diff --git a/libcxx/include/__iterator/aliasing_iterator.h b/libcxx/include/__iterator/aliasing_iterator.h
new file mode 100644
index 00000000000000..50c30f057292c5
--- /dev/null
+++ b/libcxx/include/__iterator/aliasing_iterator.h
@@ -0,0 +1,106 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ITERATOR_ALIASING_ITERATOR_H
+#define _LIBCPP___ITERATOR_ALIASING_ITERATOR_H
+
+#include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__type_traits/is_trivial.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _BaseT, class _Alias>
+class __aliasing_iterator {
+ _BaseT* __base_ = nullptr;
+
+public:
+ using iterator_category = random_access_iterator_tag;
+ using value_type = _Alias;
+ using difference_type = ptrdiff_t;
+
+ static_assert(is_trivial<_Alias>::value);
+ static_assert(sizeof(_BaseT) == sizeof(_Alias));
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator() = default;
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator(_BaseT* __base) _NOEXCEPT : __base_(__base) {}
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator& operator++() _NOEXCEPT {
+ ++__base_;
+ return *this;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator operator++(int) _NOEXCEPT {
+ __aliasing_iterator __tmp(*this);
+ ++__base_;
+ return __tmp;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator& operator--() _NOEXCEPT {
+ --__base_;
+ return *this;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator operator--(int) _NOEXCEPT {
+ __aliasing_iterator __tmp(*this);
+ --__base_;
+ return __tmp;
+ }
+
+ friend _LIBCPP_HIDE_FROM_ABI __aliasing_iterator
+ operator+(__aliasing_iterator __iter, difference_type __n) _NOEXCEPT {
+ return __aliasing_iterator(__iter.__base_ + __n);
+ }
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator operator+=(difference_type __n) _NOEXCEPT {
+ __base_ += __n;
+ return *this;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator operator-(difference_type __n) _NOEXCEPT {
+ return __aliasing_iterator(__base_ - __n);
+ }
+
+ _LIBCPP_HIDE_FROM_ABI __aliasing_iterator operator-=(difference_type __n) _NOEXCEPT {
+ __base_ -= __n;
+ return *this;
+ }
+
+ friend _LIBCPP_HIDE_FROM_ABI difference_type
+ operator-(__aliasing_iterator __lhs, __aliasing_iterator __rhs) _NOEXCEPT {
+ return __lhs.__base_ - __rhs.__base_;
+ }
+
+ _BaseT* base() _NOEXCEPT { return __base_; }
+ const _BaseT* base() const _NOEXCEPT { return __base_; }
+
+ _LIBCPP_HIDE_FROM_ABI _Alias operator*() const _NOEXCEPT {
+ _Alias __val;
+ __builtin_memcpy(&__val, __base_, sizeof(_BaseT));
+ return __val;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Alias operator[](difference_type __n) const _NOEXCEPT { return *(*this + __n); }
+
+ friend bool operator==(const __aliasing_iterator& __lhs, const __aliasing_iterator& __rhs) _NOEXCEPT {
+ return __lhs.__base_ == __rhs.__base_;
+ }
+
+ friend bool operator!=(const __aliasing_iterator& __lhs, const __aliasing_iterator& __rhs) _NOEXCEPT {
+ return __lhs.__base_ != __rhs.__base_;
+ }
+};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ITERATOR_ALIASING_ITERATOR_H
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
index eb5f7cacdde34b..72df17628dad78 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
@@ -66,14 +66,27 @@ TEST_CONSTEXPR_CXX20 void check(Container1 lhs, Container2 rhs, size_t offset) {
#endif
}
-struct NonTrivial {
+// Compares modulo 4 to make sure we only forward to the vectorized version if we are trivially equality comparable
+struct NonTrivialMod4Comp {
int i_;
- TEST_CONSTEXPR_CXX20 NonTrivial(int i) : i_(i) {}
- TEST_CONSTEXPR_CXX20 NonTrivial(NonTrivial&& other) : i_(other.i_) { other.i_ = 0; }
+ TEST_CONSTEXPR_CXX20 NonTrivialMod4Comp(int i) : i_(i) {}
+ TEST_CONSTEXPR_CXX20 NonTrivialMod4Comp(NonTrivialMod4Comp&& other) : i_(other.i_) { other.i_ = 0; }
- TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivial& lhs, const NonTrivial& rhs) { return lhs.i_ == rhs.i_; }
+ TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivialMod4Comp& lhs, const NonTrivialMod4Comp& rhs) {
+ return lhs.i_ % 4 == rhs.i_ % 4;
+ }
+};
+
+#if TEST_STD_VER >= 20
+struct TriviallyEqualityComparable {
+ int i_;
+
+ TEST_CONSTEXPR_CXX20 TriviallyEqualityComparable(int i) : i_(i) {}
+
+ TEST_CONSTEXPR_CXX20 friend bool operator==(TriviallyEqualityComparable, TriviallyEqualityComparable) = default;
};
+#endif // TEST_STD_VER >= 20
struct ModTwoComp {
TEST_CONSTEXPR_CXX20 bool operator()(int lhs, int rhs) { return lhs % 2 == rhs % 2; }
@@ -136,16 +149,30 @@ TEST_CONSTEXPR_CXX20 bool test() {
types::for_each(types::cpp17_input_iterator_list<int*>(), Test());
{ // use a non-integer type to also test the general case - all elements match
- std::array<NonTrivial, 8> lhs = {1, 2, 3, 4, 5, 6, 7, 8};
- std::array<NonTrivial, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
- check<NonTrivial*>(std::move(lhs), std::move(rhs), 8);
+ std::array<NonTrivialMod4Comp, 8> lhs = {1, 2, 3, 4, 5, 6, 7, 8};
+ std::array<NonTrivialMod4Comp, 8> rhs = {1, 2, 3, 4, 1, 6, 7, 8};
+ check<NonTrivialMod4Comp*>(std::move(lhs), std::move(rhs), 8);
}
{ // use a non-integer type to also test the general case - not all elements match
- std::array<NonTrivial, 8> lhs = {1, 2, 3, 4, 7, 6, 7, 8};
- std::array<NonTrivial, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
- check<NonTrivial*>(std::move(lhs), std::move(rhs), 4);
+ std::array<NonTrivialMod4Comp, 8> lhs = {1, 2, 3, 4, 7, 6, 7, 8};
+ std::array<NonTrivialMod4Comp, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
+ check<NonTrivialMod4Comp*>(std::move(lhs), std::move(rhs), 4);
+ }
+
+#if TEST_STD_VER >= 20
+ { // trivially equaltiy comparable class type to test forwarding to the vectorized version - all elements match
+ std::array<TriviallyEqualityComparable, 8> lhs = {1, 2, 3, 4, 5, 6, 7, 8};
+ std::array<TriviallyEqualityComparable, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
+ check<TriviallyEqualityComparable*>(std::move(lhs), std::move(rhs), 8);
+ }
+
+ { // trivially equaltiy comparable class type to test forwarding to the vectorized version - not all elements match
+ std::array<TriviallyEqualityComparable, 8> lhs = {1, 2, 3, 4, 7, 6, 7, 8};
+ std::array<TriviallyEqualityComparable, 8> rhs = {1, 2, 3, 4, 5, 6, 7, 8};
+ check<TriviallyEqualityComparable*>(std::move(lhs), std::move(rhs), 4);
}
+#endif // TEST_STD_VER >= 20
return true;
}
More information about the libcxx-commits
mailing list