[libcxx-commits] [libcxx] [libc++] Vectorize std::adjacent_find (PR #89757)
Nikolas Klauser via libcxx-commits
libcxx-commits at lists.llvm.org
Mon Apr 29 02:29:09 PDT 2024
https://github.com/philnik777 updated https://github.com/llvm/llvm-project/pull/89757
>From d5e486783933db7e4905f500c43b092f0a79dc3d Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Sun, 14 Apr 2024 15:50:28 +0200
Subject: [PATCH] [libc++] Vectorize std::adjacent_find
---
libcxx/benchmarks/CMakeLists.txt | 1 +
.../algorithms/adjacent_find.bench.cpp | 42 ++++++
libcxx/include/__algorithm/adjacent_find.h | 80 ++++++++++-
.../alg.adjacent.find/adjacent_find.pass.cpp | 124 ++++++++++++++----
4 files changed, 220 insertions(+), 27 deletions(-)
create mode 100644 libcxx/benchmarks/algorithms/adjacent_find.bench.cpp
diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 527a2acf2d3b36..33f9b18f80eb73 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -173,6 +173,7 @@ endfunction()
#==============================================================================
set(BENCHMARK_TESTS
algorithms.partition_point.bench.cpp
+ algorithms/adjacent_find.bench.cpp
algorithms/count.bench.cpp
algorithms/equal.bench.cpp
algorithms/find.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/adjacent_find.bench.cpp b/libcxx/benchmarks/algorithms/adjacent_find.bench.cpp
new file mode 100644
index 00000000000000..4467acbcfbefd2
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/adjacent_find.bench.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <random>
+
+void BenchmarkSizes(benchmark::internal::Benchmark* Benchmark) { // Registers the input sizes each benchmark is run with.
+ Benchmark->DenseRange(1, 8); // small sizes: 1 through 8
+ for (size_t i = 16; i != 1 << 20; i *= 2) { // powers of two from 16 up to, but excluding, 1 << 20
+ Benchmark->Arg(i - 1); // one element below the power of two
+ Benchmark->Arg(i);
+ Benchmark->Arg(i + 1); // one element above the power of two
+ }
+}
+
+// TODO: Look into benchmarking aligned and unaligned memory explicitly
+// (currently things happen to be aligned because they are malloced that way)
+template <class T>
+static void bm_adjacent_find(benchmark::State& state) { // Times std::adjacent_find on a vector of state.range() elements of type T.
+ std::vector<T> vec1(state.range()); // NOTE(review): <vector> is not among this file's visible includes -- confirm it is reachable transitively
+
+ size_t val = 1;
+ for (auto& e : vec1) { // fill with consecutive values 1, 2, 3, ... converted to T on assignment
+ e = val++;
+ }
+
+ for (auto _ : state) {
+ benchmark::DoNotOptimize(vec1);
+ benchmark::DoNotOptimize(std::adjacent_find(vec1.begin(), vec1.end())); // result is passed to DoNotOptimize, presumably so the call is not elided
+ }
+}
+BENCHMARK(bm_adjacent_find<char>)->Apply(BenchmarkSizes); // each element type is run over the sizes registered by BenchmarkSizes
+BENCHMARK(bm_adjacent_find<short>)->Apply(BenchmarkSizes);
+BENCHMARK(bm_adjacent_find<int>)->Apply(BenchmarkSizes);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/include/__algorithm/adjacent_find.h b/libcxx/include/__algorithm/adjacent_find.h
index 6f15456e3a4d07..b4c511383abd64 100644
--- a/libcxx/include/__algorithm/adjacent_find.h
+++ b/libcxx/include/__algorithm/adjacent_find.h
@@ -12,8 +12,11 @@
#include <__algorithm/comp.h>
#include <__algorithm/iterator_operations.h>
+#include <__algorithm/simd_utils.h>
+#include <__algorithm/unwrap_iter.h>
#include <__config>
#include <__iterator/iterator_traits.h>
+#include <__type_traits/is_constant_evaluated.h>
#include <__utility/move.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,7 +30,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Iter, class _Sent, class _BinaryPredicate>
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
-__adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
+__adjacent_find_loop(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
if (__first == __last)
return __first;
_Iter __i = __first;
@@ -39,10 +42,83 @@ __adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
return __i;
}
+template <class _Iter, class _Sent, class _BinaryPredicate>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+__adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) { // Generic entry point: forwards straight to the element-wise loop.
+ return std::__adjacent_find_loop(__first, __last, __pred);
+}
+
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+
+template <class _Tp, class _Pred>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp*
+__adjacent_find_vectorized(_Tp* __first, _Tp* __last, _Pred& __pred) { // Searches [__first, __last) for the first element equal to its successor, using vector compares outside constant evaluation.
+ constexpr size_t __unroll_count = 4; // number of vector compares issued per iteration of the main loop
+ constexpr size_t __vec_size = __native_vector_size<_Tp>;
+ using __vec = __simd_vector<_Tp, __vec_size>;
+
+ if (!__libcpp_is_constant_evaluated()) { // the vector path is skipped during constant evaluation
+ auto __orig_first = __first; // kept to decide later whether a load ending at __last stays inside the range
+ while (static_cast<size_t>(__last - __first) > __unroll_count * __vec_size) [[__unlikely__]] { // strict `>`: the shifted loads below presumably touch __unroll_count * __vec_size + 1 elements
+ __vec __cmp_res[__unroll_count];
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i) { // compare each vector with the one starting one element later
+ __cmp_res[__i] = std::__load_vector<__vec>(__first + __i * __vec_size) !=
+ std::__load_vector<__vec>(__first + __i * __vec_size + 1);
+ }
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i) {
+ if (!std::__all_of(__cmp_res[__i])) { // not every lane reported "different": presumably an equal adjacent pair exists
+ auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res[__i]);
+ return __first + __offset;
+ }
+ }
+
+ __first += __unroll_count * __vec_size;
+ }
+
+ // check the last 0-3 vectors
+ while (static_cast<size_t>(__last - __first) > __vec_size) [[__unlikely__]] { // strict `>`: the shifted load reads one element past the first vector
+ if (auto __cmp_res = std::__load_vector<__vec>(__first) != std::__load_vector<__vec>(__first + 1);
+ !std::__all_of(__cmp_res)) {
+ auto __offset = std::__find_first_not_set(__cmp_res);
+ return __first + __offset;
+ }
+ __first += __vec_size;
+ }
+
+ if (__first == __last)
+ return __first;
+
+ // Check if we can load elements in front of the current pointer. If that's the case load a vector at
+ // (last - vector_size - 1) to check the remaining elements
+ if (static_cast<size_t>(__first - __orig_first) > __vec_size) { // NOTE(review): `>=` looks sufficient to keep the load below in range -- confirm
+ __first = __last - __vec_size - 1; // steps back so the final load ends exactly at __last, overlapping already-checked elements
+ auto __offset =
+ std::__find_first_not_set(std::__load_vector<__vec>(__first) != std::__load_vector<__vec>(__first + 1));
+ if (__offset == __vec_size) // assumes __find_first_not_set yields the lane count when every lane is set -- confirm in simd_utils.h
+ return __last;
+ return __first + __offset;
+ }
+ } // else loop over the elements individually
+ return std::__adjacent_find_loop(__first, __last, __pred); // reached during constant evaluation, or when the overlapping tail load above was not taken
+}
+
+template <class _Tp,
+ class _Pred,
+ __enable_if_t<is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp>, int> = 0> // enabled only for integral _Tp with a predicate tagged as __equal_tag
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp*
+__adjacent_find(_Tp* __first, _Tp* __last, _Pred& __pred) { // Raw-pointer overload that forwards to the vectorized search.
+ return std::__adjacent_find_vectorized(__first, __last, __pred);
+}
+
+#endif // _LIBCPP_VECTORIZE_ALGORITHMS
+
template <class _ForwardIterator, class _BinaryPredicate>
_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
adjacent_find(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred) {
- return std::__adjacent_find(std::move(__first), std::move(__last), __pred);
+ return std::__rewrap_iter( // search on the unwrapped iterators, then convert the result back relative to __first
+ __first, std::__adjacent_find(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __pred));
}
template <class _ForwardIterator>
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp
index 6d57c5869ab704..94d2947cf629f6 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp
@@ -14,39 +14,113 @@
// adjacent_find(Iter first, Iter last);
#include <algorithm>
+#include <array>
#include <cassert>
+#include <vector>
#include "test_macros.h"
#include "test_iterators.h"
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
- int ia[] = {0, 1, 2, 2, 0, 1, 2, 3};
- int ib[] = {0, 1, 2, 7, 0, 1, 2, 3};
+struct Test { // Functor whose call operator runs the basic adjacent_find checks for one iterator type.
+ template <class Iter>
+ TEST_CONSTEXPR_CXX20 void operator()() {
+ int ia[] = {0, 1, 2, 2, 0, 1, 2, 3};
+ const unsigned sa = sizeof(ia) / sizeof(ia[0]); // element count of ia
+ assert(std::adjacent_find(Iter(ia), Iter(ia + sa)) == Iter(ia + 2)); // first equal pair is at indices 2 and 3
+ assert(std::adjacent_find(Iter(ia), Iter(ia)) == Iter(ia)); // empty range returns its end
+ assert(std::adjacent_find(Iter(ia + 3), Iter(ia + sa)) == Iter(ia + sa)); // {2, 0, 1, 2, 3} has no equal neighbours
+ }
+};
- return (std::adjacent_find(std::begin(ia), std::end(ia)) == ia+2)
- && (std::adjacent_find(std::begin(ib), std::end(ib)) == std::end(ib))
- ;
- }
-#endif
+struct NonTrivial { // Class type wrapping an int; compared through its own operator==.
+ int i_;
+
+ TEST_CONSTEXPR_CXX20 NonTrivial(int i) : i_(i) {} // implicit, so arrays can be initialized from int literals
+ TEST_CONSTEXPR_CXX20 NonTrivial(NonTrivial&& other) : i_(other.i_) { other.i_ = 0; } // move takes the value and zeroes the source
+
+ TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivial& lhs, const NonTrivial& rhs) { return lhs.i_ == rhs.i_; }
+};
+
+struct ModTwoComp { // Binary predicate used as a custom comparator in the tests.
+ TEST_CONSTEXPR_CXX20 bool operator()(int lhs, int rhs) { return lhs % 2 == rhs % 2; } // true when both values have the same remainder modulo 2
+};
+
+TEST_CONSTEXPR_CXX20 bool test() { // Runs the checks that are also evaluated in a static_assert; always returns true.
+ types::for_each(types::forward_iterator_list<int*>(), Test()); // run Test once per iterator type in the list
+
+ { // use a non-integer type to also test the general case - no match
+ std::array<NonTrivial, 8> arr = {1, 2, 3, 4, 5, 6, 7, 8};
+ assert(std::adjacent_find(arr.begin(), arr.end()) == arr.end());
+ }
+
+ { // use a non-integer type to also test the general case - match
+ std::array<NonTrivial, 8> lhs = {1, 2, 3, 4, 4, 6, 7, 8};
+ assert(std::adjacent_find(lhs.begin(), lhs.end()) == lhs.begin() + 3); // the two 4s sit at indices 3 and 4
+ }
+
+ { // use a custom comparator
+ std::array<int, 8> lhs = {0, 1, 2, 3, 5, 6, 7, 8};
+ assert(std::adjacent_find(lhs.begin(), lhs.end(), ModTwoComp()) == lhs.begin() + 3); // 3 and 5 are the first neighbours with equal parity
+ }
+
+ return true;
+}
-int main(int, char**)
-{
- int ia[] = {0, 1, 2, 2, 0, 1, 2, 3};
- const unsigned sa = sizeof(ia)/sizeof(ia[0]);
- assert(std::adjacent_find(forward_iterator<const int*>(ia),
- forward_iterator<const int*>(ia + sa)) ==
- forward_iterator<const int*>(ia+2));
- assert(std::adjacent_find(forward_iterator<const int*>(ia),
- forward_iterator<const int*>(ia)) ==
- forward_iterator<const int*>(ia));
- assert(std::adjacent_find(forward_iterator<const int*>(ia+3),
- forward_iterator<const int*>(ia + sa)) ==
- forward_iterator<const int*>(ia+sa));
-
-#if TEST_STD_VER > 17
- static_assert(test_constexpr());
+template <class T>
+void fill_vec(std::vector<T>& vec) { // Overwrites every element with its own index converted to T.
+ for (size_t i = 0; i != vec.size(); ++i) {
+ vec[i] = static_cast<T>(i);
+ }
+}
+
+int main(int, char**) {
+ test(); // run-time evaluation of the shared checks
+#if TEST_STD_VER >= 20
+ static_assert(test()); // the same checks during constant evaluation
#endif
+ { // check with a lot of elements to test the vectorization optimization
+ {
+ std::vector<char> vec(256);
+ fill_vec(vec);
+ for (size_t i = 0; i != vec.size() - 1; ++i) { // place the only equal pair at every possible position in turn
+ vec[i] = static_cast<char>(i + 1); // element i now equals element i + 1
+ assert(std::adjacent_find(vec.begin(), vec.end()) == vec.begin() + i);
+ vec[i] = static_cast<char>(i); // restore the original value
+ }
+ }
+
+ {
+ std::vector<int> vec(256);
+ fill_vec(vec);
+ for (size_t i = 0; i != vec.size() - 1; ++i) { // same sweep as above with int elements
+ vec[i] = static_cast<int>(i + 1);
+ assert(std::adjacent_find(vec.begin(), vec.end()) == vec.begin() + i);
+ vec[i] = static_cast<int>(i);
+ }
+ }
+ }
+
+ { // check the tail of the vectorized loop
+ for (size_t vec_size = 2; vec_size != 256; ++vec_size) { // every length from 2 through 255
+ {
+ std::vector<char> vec(vec_size);
+ fill_vec(vec);
+
+ assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end()); // index-filled data has no equal neighbours
+ vec.back() = static_cast<char>(vec.size() - 2); // make the last element equal to the one before it
+ assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end() - 2);
+ }
+ {
+ std::vector<int> vec(vec_size);
+ fill_vec(vec);
+
+ assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end());
+ vec.back() = static_cast<int>(vec.size() - 2); // make the last element equal to the one before it
+ assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end() - 2);
+ }
+ }
+ }
+
return 0;
}
More information about the libcxx-commits
mailing list