[libcxx-commits] [libcxx] [libc++] Vectorize std::find (PR #156431)

via libcxx-commits libcxx-commits at lists.llvm.org
Mon Sep 15 08:31:24 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-libcxx

Author: Nikolas Klauser (philnik777)

<details>
<summary>Changes</summary>

```
-----------------------------------------------------------------------------
Benchmark                                                 old             new
-----------------------------------------------------------------------------
std::find(vector<char>) (bail 25%)/8                  1.43 ns         1.44 ns
std::find(vector<char>) (bail 25%)/1024               5.54 ns         5.59 ns
std::find(vector<char>) (bail 25%)/8192               38.4 ns         39.1 ns
std::find(vector<char>) (bail 25%)/32768               134 ns          136 ns
std::find(vector<int>) (bail 25%)/8                   1.56 ns         1.57 ns
std::find(vector<int>) (bail 25%)/1024                65.3 ns         65.4 ns
std::find(vector<int>) (bail 25%)/8192                 465 ns          464 ns
std::find(vector<int>) (bail 25%)/32768               1832 ns         1832 ns
std::find(vector<long long>) (bail 25%)/8            0.920 ns         1.20 ns
std::find(vector<long long>) (bail 25%)/1024          65.2 ns         31.2 ns
std::find(vector<long long>) (bail 25%)/8192           464 ns          255 ns
std::find(vector<long long>) (bail 25%)/32768         1833 ns          992 ns
std::find(vector<char>) (process all)/8               1.21 ns         1.22 ns
std::find(vector<char>) (process all)/50              1.92 ns         1.93 ns
std::find(vector<char>) (process all)/1024            16.6 ns         16.9 ns
std::find(vector<char>) (process all)/8192             134 ns          136 ns
std::find(vector<char>) (process all)/32768            488 ns          503 ns
std::find(vector<int>) (process all)/8                2.45 ns         2.48 ns
std::find(vector<int>) (process all)/50               12.7 ns         12.7 ns
std::find(vector<int>) (process all)/1024              236 ns          236 ns
std::find(vector<int>) (process all)/8192             1830 ns         1834 ns
std::find(vector<int>) (process all)/32768            7351 ns         7346 ns
std::find(vector<long long>) (process all)/8          2.02 ns         1.45 ns
std::find(vector<long long>) (process all)/50         12.0 ns         6.12 ns
std::find(vector<long long>) (process all)/1024        235 ns          123 ns
std::find(vector<long long>) (process all)/8192       1830 ns          983 ns
std::find(vector<long long>) (process all)/32768      7306 ns         3969 ns
std::find(vector<bool>) (process all)/8               1.14 ns         1.15 ns
std::find(vector<bool>) (process all)/50              1.16 ns         1.17 ns
std::find(vector<bool>) (process all)/1024            4.51 ns         4.53 ns
std::find(vector<bool>) (process all)/8192            33.6 ns         33.5 ns
std::find(vector<bool>) (process all)/1048576         3660 ns         3660 ns
```

---
Full diff: https://github.com/llvm/llvm-project/pull/156431.diff


4 Files Affected:

- (modified) libcxx/docs/ReleaseNotes/22.rst (+2) 
- (modified) libcxx/include/__algorithm/find.h (+83-23) 
- (modified) libcxx/include/__algorithm/simd_utils.h (+9-2) 
- (modified) libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp (+1) 


``````````diff
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index cd8171dc5d1e8..f36230697a25c 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -62,6 +62,8 @@ Improvements and New Features
 - ``ofstream::write`` has been optimized to pass through large strings to system calls directly instead of copying them
   in chunks into a buffer.
 
+- The performance of ``std::find`` has been improved by up to 2x for integral types
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 8c8cb5820fee3..b76216c0a2d7b 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -12,6 +12,7 @@
 
 #include <__algorithm/find_segment_if.h>
 #include <__algorithm/min.h>
+#include <__algorithm/simd_utils.h>
 #include <__algorithm/unwrap_iter.h>
 #include <__bit/countr.h>
 #include <__bit/invert_if.h>
@@ -44,39 +45,98 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // generic implementation
 template <class _Iter, class _Sent, class _Tp, class _Proj>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
   for (; __first != __last; ++__first)
     if (std::__invoke(__proj, *__first) == __value)
       break;
   return __first;
 }
 
-// trivially equality comparable implementations
-template <class _Tp,
-          class _Up,
-          class _Proj,
-          __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
-                            sizeof(_Tp) == 1,
-                        int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
-  if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
-    return __ret;
-  return __last;
+template <class _Iter, class _Sent, class _Tp, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+  return std::__find_loop(std::move(__first), std::move(__last), __value, __proj);
 }
 
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-template <class _Tp,
-          class _Up,
-          class _Proj,
-          __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
-                            sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
-                        int> = 0>
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+template <class _Tp, class _Up>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) {
+  constexpr size_t __unroll_count = 4;
+  constexpr size_t __vec_size     = __native_vector_size<_Tp>;
+  using __vec                     = __simd_vector<_Tp, __vec_size>;
+
+  auto __orig_first = __first;
+
+  auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value);
+  while (static_cast<size_t>(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] {
+    __vec __lhs[__unroll_count];
+
+    for (size_t __i = 0; __i != __unroll_count; ++__i)
+      __lhs[__i] = std::__load_vector<__vec>(__first + __i * __vec_size);
+
+    for (size_t __i = 0; __i != __unroll_count; ++__i) {
+      if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
+        auto __offset = __i * __vec_size + std::__find_first_set(__cmp_res);
+        return __first + __offset;
+      }
+    }
+
+    __first += __unroll_count * __vec_size;
+  }
+
+  // check the remaining 0-3 vectors
+  while (static_cast<size_t>(__last - __first) >= __vec_size) {
+    if (auto __cmp_res = std::__load_vector<__vec>(__first) == __values; std::__any_of(__cmp_res)) {
+      return __first + std::__find_first_set(__cmp_res);
+    }
+    __first += __vec_size;
+  }
+
+  if (__last - __first == 0)
+    return __first;
+
+  // Check if we can load elements in front of the current pointer. If that's the case load a vector at
+  // (last - vector_size) to check the remaining elements
+  if (static_cast<size_t>(__first - __orig_first) >= __vec_size) {
+    __first = __last - __vec_size;
+    return __first + std::__find_first_set(std::__load_vector<__vec>(__first) == __values);
+  }
+
+  __identity __proj;
+  return std::__find_loop(__first, __last, __value, __proj);
+}
+#endif
+
+// trivially equality comparable implementations
+template <
+    class _Tp,
+    class _Up,
+    class _Proj,
+    __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
-  if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
-    return __ret;
-  return __last;
+  if constexpr (sizeof(_Tp) == 1) {
+    if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
+      return __ret;
+    return __last;
+  }
+#if _LIBCPP_HAS_WIDE_CHARACTERS
+  else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
+    if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
+      return __ret;
+    return __last;
+  }
+#endif
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+  else if constexpr (is_integral<_Tp>::value) {
+    return std::__find_vectorized(__first, __last, __value);
+  }
+#endif
+  else {
+    __identity __proj;
+    return std::__find_loop(__first, __last, __value, __proj);
+  }
 }
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 // TODO: This should also be possible to get right with different signedness
 // cast integral types to allow vectorization
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 07fef20f6166d..eb5da7f46ecd4 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -108,19 +108,26 @@ using __simd_vector_underlying_type_t _LIBCPP_NODEBUG = decltype(std::__simd_vec
 
 // This isn't inlined without always_inline when loading chars.
 template <class _VecT, class _Iter>
-[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept {
+[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR _VecT
+__load_vector(_Iter __iter) noexcept {
   return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept {
     return _VecT{__iter[_Indices]...};
   }(make_index_sequence<__simd_vector_size_v<_VecT>>{});
 }
 
+template <class _Tp, size_t _Np>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept {
+  return __builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
+}
+
 template <class _Tp, size_t _Np>
 [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
   return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
 }
 
 template <class _Tp, size_t _Np>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_t
+__find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
   using __mask_vec = __simd_vector<bool, _Np>;
 
   // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
index b2ead1cc75585..afea31fb59e95 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
@@ -51,6 +51,7 @@ int main(int argc, char** argv) {
     // find
     bm.template operator()<std::vector<char>>("std::find(vector<char>) (" + comment + ")", std_find);
     bm.template operator()<std::vector<int>>("std::find(vector<int>) (" + comment + ")", std_find);
+    bm.template operator()<std::vector<long long>>("std::find(vector<long long>) (" + comment + ")", std_find);
     bm.template operator()<std::deque<int>>("std::find(deque<int>) (" + comment + ")", std_find);
     bm.template operator()<std::list<int>>("std::find(list<int>) (" + comment + ")", std_find);
 

``````````

</details>


https://github.com/llvm/llvm-project/pull/156431


More information about the libcxx-commits mailing list