[libcxx-commits] [libcxx] [libc++] Vectorize std::adjacent_find (PR #89757)

Mon Apr 29 02:29:09 PDT 2024

https://github.com/philnik777 updated https://github.com/llvm/llvm-project/pull/89757

>From d5e486783933db7e4905f500c43b092f0a79dc3d Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Sun, 14 Apr 2024 15:50:28 +0200
Subject: [PATCH] [libc++] Vectorize std::adjacent_find

---
 libcxx/benchmarks/CMakeLists.txt              |   1 +
 .../algorithms/adjacent_find.bench.cpp        |  42 ++++++
 libcxx/include/__algorithm/adjacent_find.h    |  80 ++++++++++-
 .../alg.adjacent.find/adjacent_find.pass.cpp  | 124 ++++++++++++++----
 4 files changed, 220 insertions(+), 27 deletions(-)
 create mode 100644 libcxx/benchmarks/algorithms/adjacent_find.bench.cpp

diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 527a2acf2d3b36..33f9b18f80eb73 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -173,6 +173,7 @@ endfunction()
 #==============================================================================
 set(BENCHMARK_TESTS
     algorithms.partition_point.bench.cpp
+    algorithms/adjacent_find.bench.cpp
     algorithms/count.bench.cpp
     algorithms/equal.bench.cpp
     algorithms/find.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/adjacent_find.bench.cpp b/libcxx/benchmarks/algorithms/adjacent_find.bench.cpp
new file mode 100644
index 00000000000000..4467acbcfbefd2
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/adjacent_find.bench.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <random>
+
+void BenchmarkSizes(benchmark::internal::Benchmark* Benchmark) {
+  Benchmark->DenseRange(1, 8); // every size in [1, 8]
+  for (size_t shift = 4; shift < 20; ++shift) { // then 2^4 .. 2^19, each with both of its neighbours
+    const size_t pow2 = static_cast<size_t>(1) << shift;
+    for (size_t delta = 0; delta != 3; ++delta)
+      Benchmark->Arg(pow2 - 1 + delta);
+  }
+}
+
+// TODO: Look into benchmarking aligned and unaligned memory explicitly
+// (currently things happen to be aligned because they are malloced that way)
+template <class T>
+static void bm_adjacent_find(benchmark::State& state) {
+  // Input is 1, 2, 3, ... converted to T: for the integral types registered below no two
+  // neighbours compare equal, so every call walks the entire range.
+  std::vector<T> vec1(state.range());
+  const size_t count = vec1.size();
+  for (size_t idx = 0; idx != count; ++idx)
+    vec1[idx] = idx + 1;
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec1);
+    benchmark::DoNotOptimize(std::adjacent_find(vec1.begin(), vec1.end()));
+  }
+}
+BENCHMARK(bm_adjacent_find<char>)->Apply(BenchmarkSizes); // char, short and int elements, each over BenchmarkSizes
+BENCHMARK(bm_adjacent_find<short>)->Apply(BenchmarkSizes);
+BENCHMARK(bm_adjacent_find<int>)->Apply(BenchmarkSizes);
+
+BENCHMARK_MAIN(); // google-benchmark macro that supplies main()
diff --git a/libcxx/include/__algorithm/adjacent_find.h b/libcxx/include/__algorithm/adjacent_find.h
index 6f15456e3a4d07..b4c511383abd64 100644
--- a/libcxx/include/__algorithm/adjacent_find.h
+++ b/libcxx/include/__algorithm/adjacent_find.h
@@ -12,8 +12,11 @@
 
 #include <__algorithm/comp.h>
 #include <__algorithm/iterator_operations.h>
+#include <__algorithm/simd_utils.h>
+#include <__algorithm/unwrap_iter.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
+#include <__type_traits/is_constant_evaluated.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -27,7 +30,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Iter, class _Sent, class _BinaryPredicate>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
-__adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
+__adjacent_find_loop(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
   if (__first == __last)
     return __first;
   _Iter __i = __first;
@@ -39,10 +42,83 @@ __adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
   return __i;
 }
 
+template <class _Iter, class _Sent, class _BinaryPredicate> // generic fallback for any iterator/sentinel pair
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+__adjacent_find(_Iter __first, _Sent __last, _BinaryPredicate&& __pred) {
+  return std::__adjacent_find_loop(std::move(__first), std::move(__last), __pred); // move, don't copy
+}
+
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+
+template <class _Tp, class _Pred>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp*
+__adjacent_find_vectorized(_Tp* __first, _Tp* __last, _Pred& __pred) {
+  constexpr size_t __unroll_count = 4;
+  constexpr size_t __vec_size     = __native_vector_size<_Tp>;
+  using __vec                     = __simd_vector<_Tp, __vec_size>;
+  // Finds the first element equal to its successor. Vector code runs only at runtime and never calls __pred.
+  if (!__libcpp_is_constant_evaluated()) {
+    auto __orig_first = __first; // start of the input, needed for the backwards load at the end
+    while (static_cast<size_t>(__last - __first) > __unroll_count * __vec_size) [[__unlikely__]] {
+      __vec __cmp_res[__unroll_count];
+      // Each vector is compared with the one starting one element later, hence the strict '>' above.
+      for (size_t __i = 0; __i != __unroll_count; ++__i) {
+        __cmp_res[__i] = std::__load_vector<__vec>(__first + __i * __vec_size) !=
+                         std::__load_vector<__vec>(__first + __i * __vec_size + 1);
+      }
+      // NOTE(review): an unset lane presumably marks an element equal to its successor - see simd_utils.h.
+      for (size_t __i = 0; __i != __unroll_count; ++__i) {
+        if (!std::__all_of(__cmp_res[__i])) {
+          auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res[__i]);
+          return __first + __offset;
+        }
+      }
+
+      __first += __unroll_count * __vec_size;
+    }
+
+    // check the remaining 0-3 full vectors, one at a time
+    while (static_cast<size_t>(__last - __first) > __vec_size) [[__unlikely__]] {
+      if (auto __cmp_res = std::__load_vector<__vec>(__first) != std::__load_vector<__vec>(__first + 1);
+          !std::__all_of(__cmp_res)) {
+        auto __offset = std::__find_first_not_set(__cmp_res);
+        return __first + __offset;
+      }
+      __first += __vec_size;
+    }
+    // Both loops stop with at least one element left, so this only triggers for an empty input.
+    if (__first == __last)
+      return __first;
+
+    // Check whether more than __vec_size elements precede __first. If so, load the vectors starting at
+    // (__last - __vec_size - 1) and (__last - __vec_size) to check the remaining elements in one comparison.
+    if (static_cast<size_t>(__first - __orig_first) > __vec_size) {
+      __first = __last - __vec_size - 1;
+      auto __offset =
+          std::__find_first_not_set(std::__load_vector<__vec>(__first) != std::__load_vector<__vec>(__first + 1));
+      if (__offset == __vec_size) // NOTE(review): presumably "no lane unset", i.e. no match - confirm
+        return __last;
+      return __first + __offset;
+    }
+  } // else loop over the elements individually
+  return std::__adjacent_find_loop(__first, __last, __pred);
+}
+
+template <class _Tp, // overload for raw pointers; participates only when _Tp is an integral type ...
+          class _Pred, // ... and __desugars_to_v reports _Pred as __equal_tag (presumably: plain equality)
+          __enable_if_t<is_integral<_Tp>::value && __desugars_to_v<__equal_tag, _Pred, _Tp, _Tp>, int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp*
+__adjacent_find(_Tp* __first, _Tp* __last, _Pred& __pred) {
+  return std::__adjacent_find_vectorized(__first, __last, __pred); // forwards unchanged to the vectorized search
+}
+
+#endif // _LIBCPP_VECTORIZE_ALGORITHMS
+
 template <class _ForwardIterator, class _BinaryPredicate>
 _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
 adjacent_find(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred) {
-  return std::__adjacent_find(std::move(__first), std::move(__last), __pred);
+  return std::__rewrap_iter( // search on the unwrapped iterators, then convert the result back to _ForwardIterator
+      __first, std::__adjacent_find(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __pred));
 }
 
 template <class _ForwardIterator>
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp
index 6d57c5869ab704..94d2947cf629f6 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.adjacent.find/adjacent_find.pass.cpp
@@ -14,39 +14,113 @@
 //   adjacent_find(Iter first, Iter last);
 
 #include <algorithm>
+#include <array>
 #include <cassert>
+#include <vector>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
-    int ia[] = {0, 1, 2, 2, 0, 1, 2, 3};
-    int ib[] = {0, 1, 2, 7, 0, 1, 2, 3};
+struct Test {
+  template <class Iter>
+  TEST_CONSTEXPR_CXX20 void operator()() {
+    int ia[]       = {0, 1, 2, 2, 0, 1, 2, 3}; // the only equal neighbours are ia[2] and ia[3]
+    int* const end = ia + sizeof(ia) / sizeof(ia[0]);
+    assert(std::adjacent_find(Iter(ia), Iter(end)) == Iter(ia + 2));  // first match
+    assert(std::adjacent_find(Iter(ia), Iter(ia)) == Iter(ia));       // empty range
+    assert(std::adjacent_find(Iter(ia + 3), Iter(end)) == Iter(end)); // no match: returns last
+  }
+};
 
-    return  (std::adjacent_find(std::begin(ia), std::end(ia)) == ia+2)
-         && (std::adjacent_find(std::begin(ib), std::end(ib)) == std::end(ib))
-         ;
-    }
-#endif
+struct NonTrivial { // class-type element with a user-provided move constructor and operator==
+  int i_;
+  // Not explicit: the tests brace-initialize arrays of NonTrivial from plain ints.
+  TEST_CONSTEXPR_CXX20 NonTrivial(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 NonTrivial(NonTrivial&& other) : i_(other.i_) { other.i_ = 0; }
+  // Equality compares the wrapped ints.
+  TEST_CONSTEXPR_CXX20 friend bool operator==(const NonTrivial& lhs, const NonTrivial& rhs) { return lhs.i_ == rhs.i_; }
+};
+
+struct ModTwoComp { // binary predicate: true when lhs % 2 and rhs % 2 are equal
+  TEST_CONSTEXPR_CXX20 bool operator()(int lhs, int rhs) { return lhs % 2 == rhs % 2; }
+};
+
+TEST_CONSTEXPR_CXX20 bool test() {
+  types::for_each(types::forward_iterator_list<int*>(), Test());
+
+  { // general case with a non-integer element type: no two neighbours are equal
+    std::array<NonTrivial, 8> values = {1, 2, 3, 4, 5, 6, 7, 8};
+    assert(std::adjacent_find(values.begin(), values.end()) == values.end());
+  }
+
+  { // general case with a non-integer element type: values[3] == values[4]
+    std::array<NonTrivial, 8> values = {1, 2, 3, 4, 4, 6, 7, 8};
+    assert(std::adjacent_find(values.begin(), values.end()) == values.begin() + 3);
+  }
+
+  { // custom comparator: 3 and 5 are the first neighbours with the same remainder modulo 2
+    std::array<int, 8> values = {0, 1, 2, 3, 5, 6, 7, 8};
+    assert(std::adjacent_find(values.begin(), values.end(), ModTwoComp()) == values.begin() + 3);
+  }
+
+  return true;
+}
 
-int main(int, char**)
-{
-    int ia[] = {0, 1, 2, 2, 0, 1, 2, 3};
-    const unsigned sa = sizeof(ia)/sizeof(ia[0]);
-    assert(std::adjacent_find(forward_iterator<const int*>(ia),
-                              forward_iterator<const int*>(ia + sa)) ==
-                              forward_iterator<const int*>(ia+2));
-    assert(std::adjacent_find(forward_iterator<const int*>(ia),
-                              forward_iterator<const int*>(ia)) ==
-                              forward_iterator<const int*>(ia));
-    assert(std::adjacent_find(forward_iterator<const int*>(ia+3),
-                              forward_iterator<const int*>(ia + sa)) ==
-                              forward_iterator<const int*>(ia+sa));
-
-#if TEST_STD_VER > 17
-    static_assert(test_constexpr());
+template <class T>
+void fill_vec(std::vector<T>& vec) { // stores static_cast<T>(k) at every index k
+  const size_t count = vec.size();
+  for (size_t pos = 0; pos < count; ++pos)
+    vec[pos] = static_cast<T>(pos);
+}
+
+int main(int, char**) { // runs test(), then larger runtime-only inputs
+  test(); // runtime evaluation
+#if TEST_STD_VER >= 20
+  static_assert(test()); // the same checks during constant evaluation
 #endif
 
+  { // check with a lot of elements to test the vectorization optimization
+    {
+      std::vector<char> vec(256);
+      fill_vec(vec);
+      for (size_t i = 0; i != vec.size() - 1; ++i) {
+        vec[i] = static_cast<char>(i + 1); // vec[i] now equals vec[i + 1]: the only equal pair
+        assert(std::adjacent_find(vec.begin(), vec.end()) == vec.begin() + i);
+        vec[i] = static_cast<char>(i); // restore before trying the next position
+      }
+    }
+
+    {
+      std::vector<int> vec(256);
+      fill_vec(vec);
+      for (size_t i = 0; i != vec.size() - 1; ++i) {
+        vec[i] = static_cast<int>(i + 1); // vec[i] now equals vec[i + 1]: the only equal pair
+        assert(std::adjacent_find(vec.begin(), vec.end()) == vec.begin() + i);
+        vec[i] = static_cast<int>(i); // restore before trying the next position
+      }
+    }
+  }
+
+  { // check the tail of the vectorized loop
+    for (size_t vec_size = 2; vec_size != 256; ++vec_size) { // every length from 2 to 255
+      {
+        std::vector<char> vec(vec_size);
+        fill_vec(vec);
+
+        assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end()); // no equal neighbours yet
+        vec.back() = static_cast<char>(vec.size() - 2); // make the last two elements equal
+        assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end() - 2);
+      }
+      {
+        std::vector<int> vec(vec_size);
+        fill_vec(vec);
+
+        assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end()); // no equal neighbours yet
+        vec.back() = static_cast<int>(vec.size() - 2); // make the last two elements equal
+        assert(std::adjacent_find(vec.begin(), vec.end()) == vec.end() - 2);
+      }
+    }
+  }
+
   return 0;
 }