[libcxx-commits] [libcxx] [libc++] Vectorize std::find (PR #156431)
Nikolas Klauser via libcxx-commits
libcxx-commits at lists.llvm.org
Mon Sep 22 02:37:24 PDT 2025
https://github.com/philnik777 updated https://github.com/llvm/llvm-project/pull/156431
>From 4b65ea112bc592dacb47ca0adf49b5170f99f4eb Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Tue, 2 Sep 2025 10:12:21 +0200
Subject: [PATCH] [libc++] Vectorize std::find
---
libcxx/docs/ReleaseNotes/22.rst | 2 +
libcxx/include/__algorithm/find.h | 108 ++++++++++++++----
libcxx/include/__algorithm/simd_utils.h | 11 +-
.../algorithms/nonmodifying/find.bench.cpp | 1 +
4 files changed, 97 insertions(+), 25 deletions(-)
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index 509ead64ee525..6b94cacb1b4a6 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -63,6 +63,8 @@ Improvements and New Features
- Multiple internal types have been refactored to use ``[[no_unique_address]]``, resulting in faster compile times and
reduced debug information.
+- The performance of ``std::find`` has been improved by up to 2x for integral types.
+
Deprecations and Removals
-------------------------
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 8c8cb5820fee3..0b0f1e6451b2e 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -12,6 +12,7 @@
#include <__algorithm/find_segment_if.h>
#include <__algorithm/min.h>
+#include <__algorithm/simd_utils.h>
#include <__algorithm/unwrap_iter.h>
#include <__bit/countr.h>
#include <__bit/invert_if.h>
@@ -44,39 +45,100 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// generic implementation
template <class _Iter, class _Sent, class _Tp, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
for (; __first != __last; ++__first)
if (std::__invoke(__proj, *__first) == __value)
break;
return __first;
}
-// trivially equality comparable implementations
-template <class _Tp,
- class _Up,
- class _Proj,
- __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
- sizeof(_Tp) == 1,
- int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
- if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
- return __ret;
- return __last;
+template <class _Iter, class _Sent, class _Tp, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+ return std::__find_loop(std::move(__first), std::move(__last), __value, __proj);
}
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-template <class _Tp,
- class _Up,
- class _Proj,
- __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
- sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
- int> = 0>
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+template <class _Tp, class _Up>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) {
+ if (!__libcpp_is_constant_evaluated()) {
+ constexpr size_t __unroll_count = 4;
+ constexpr size_t __vec_size = __native_vector_size<_Tp>;
+ using __vec = __simd_vector<_Tp, __vec_size>;
+
+ auto __orig_first = __first;
+
+ auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value);
+ while (static_cast<size_t>(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] {
+ __vec __lhs[__unroll_count];
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i)
+ __lhs[__i] = std::__load_vector<__vec>(__first + __i * __vec_size);
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i) {
+ if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
+ auto __offset = __i * __vec_size + std::__find_first_set(__cmp_res);
+ return __first + __offset;
+ }
+ }
+
+ __first += __unroll_count * __vec_size;
+ }
+
+ // check the remaining 0-3 vectors
+ while (static_cast<size_t>(__last - __first) >= __vec_size) {
+ if (auto __cmp_res = std::__load_vector<__vec>(__first) == __values; std::__any_of(__cmp_res)) {
+ return __first + std::__find_first_set(__cmp_res);
+ }
+ __first += __vec_size;
+ }
+
+ if (__last - __first == 0)
+ return __first;
+
+ // Check if we can load elements in front of the current pointer. If that's the case load a vector at
+ // (last - vector_size) to check the remaining elements
+ if (static_cast<size_t>(__first - __orig_first) >= __vec_size) {
+ __first = __last - __vec_size;
+ return __first + std::__find_first_set(std::__load_vector<__vec>(__first) == __values);
+ }
+ }
+
+ __identity __proj;
+ return std::__find_loop(__first, __last, __value, __proj);
+}
+#endif
+
+// trivially equality comparable implementations
+template <
+ class _Tp,
+ class _Up,
+ class _Proj,
+ __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
- if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
- return __ret;
- return __last;
+ if constexpr (sizeof(_Tp) == 1) {
+ if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
+ return __ret;
+ return __last;
+ }
+#if _LIBCPP_HAS_WIDE_CHARACTERS
+ else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
+ if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
+ return __ret;
+ return __last;
+ }
+#endif
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+ else if constexpr (is_integral<_Tp>::value) {
+ return std::__find_vectorized(__first, __last, __value);
+ }
+#endif
+ else {
+ __identity __proj;
+ return std::__find_loop(__first, __last, __value, __proj);
+ }
}
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
// TODO: This should also be possible to get right with different signedness
// cast integral types to allow vectorization
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 96b074c063a5d..c632b36e78c44 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -108,19 +108,26 @@ using __simd_vector_underlying_type_t _LIBCPP_NODEBUG = decltype(std::__simd_vec
// This isn't inlined without always_inline when loading chars.
template <class _VecT, class _Iter>
-[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept {
+[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _VecT
+__load_vector(_Iter __iter) noexcept {
return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept {
return _VecT{__iter[_Indices]...};
}(make_index_sequence<__simd_vector_size_v<_VecT>>{});
}
+template <class _Tp, size_t _Np>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept {
+ return __builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
+}
+
template <class _Tp, size_t _Np>
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
}
template <class _Tp, size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_t
+__find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
using __mask_vec = __simd_vector<bool, _Np>;
// This has MSan disabled due to https://llvm.org/PR85876
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
index b2ead1cc75585..afea31fb59e95 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
@@ -51,6 +51,7 @@ int main(int argc, char** argv) {
// find
bm.template operator()<std::vector<char>>("std::find(vector<char>) (" + comment + ")", std_find);
bm.template operator()<std::vector<int>>("std::find(vector<int>) (" + comment + ")", std_find);
+ bm.template operator()<std::vector<long long>>("std::find(vector<long long>) (" + comment + ")", std_find);
bm.template operator()<std::deque<int>>("std::find(deque<int>) (" + comment + ")", std_find);
bm.template operator()<std::list<int>>("std::find(list<int>) (" + comment + ")", std_find);
More information about the libcxx-commits
mailing list