[llvm] [libcxx] [clang-tools-extra] [libc++] Introduce one-sided binary search for lower_bound on non-random iterators, and use that to improve the average complexity of set_intersection. (PR #75230)

Iuri Chaer via cfe-commits cfe-commits at lists.llvm.org
Mon Jan 29 06:29:53 PST 2024


https://github.com/ichaer updated https://github.com/llvm/llvm-project/pull/75230

>From b65415f5b70591eae965cae1316054145d399158 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:52:13 +0100
Subject: [PATCH 01/14] [libc++][test] Add lower_bound complexity validation
 tests prior to introducing one-sided binary search for non-random iterators.

---
 .../lower.bound/lower_bound.pass.cpp          | 19 +++++--
 .../lower.bound/lower_bound_comp.pass.cpp     | 28 ++++++++--
 libcxx/test/support/test_iterators.h          | 55 ++++++++++++++-----
 3 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
index a2d8ab632303cb..5c11962d137779 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
@@ -39,11 +39,20 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-    Iter i = std::lower_bound(first, last, value);
-    for (Iter j = first; j != i; ++j)
-        assert(*j < value);
-    for (Iter j = i; j != last; ++j)
-        assert(!(*j < value));
+  std::size_t strides{};
+  std::size_t displacement{};
+  stride_counting_iterator f(first, &strides, &displacement);
+  stride_counting_iterator l(last, &strides, &displacement);
+
+  auto i = std::lower_bound(f, l, value);
+  for (auto j = f; j != i; ++j)
+    assert(*j < value);
+  for (auto j = i; j != l; ++j)
+    assert(!(*j < value));
+
+  auto len = std::distance(first, last);
+  assert(strides <= 2.5 * len + 1);
+  assert(displacement <= 2.5 * len + 1);
 }
 
 template <class Iter>
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
index b9133028d9ade2..05fd43eada4616 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
@@ -17,6 +17,7 @@
 #include <vector>
 #include <cassert>
 #include <cstddef>
+#include <cmath>
 
 #include "test_macros.h"
 #include "test_iterators.h"
@@ -38,11 +39,28 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-    Iter i = std::lower_bound(first, last, value, std::greater<int>());
-    for (Iter j = first; j != i; ++j)
-        assert(std::greater<int>()(*j, value));
-    for (Iter j = i; j != last; ++j)
-        assert(!std::greater<int>()(*j, value));
+  std::size_t strides{};
+  std::size_t displacement{};
+  stride_counting_iterator f(first, &strides, &displacement);
+  stride_counting_iterator l(last, &strides, &displacement);
+
+  std::size_t comparisons{};
+  auto cmp = [&comparisons](int rhs, int lhs) {
+    ++comparisons;
+    return std::greater<int>()(rhs, lhs);
+  };
+
+  auto i = std::lower_bound(f, l, value, cmp);
+
+  for (auto j = f; j != i; ++j)
+    assert(std::greater<int>()(*j, value));
+  for (auto j = i; j != l; ++j)
+    assert(!std::greater<int>()(*j, value));
+
+  auto len = std::distance(first, last);
+  assert(strides <= 2.5 * len + 1);
+  assert(displacement <= 2.5 * len + 1);
+  assert(comparisons <= 2 * ceil(log(len + 1) + 2));
 }
 
 template <class Iter>
diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 1133b9597d09cf..3b86a93564e4b5 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -730,7 +730,9 @@ struct common_input_iterator {
 // * `stride_displacement`, which records the displacement of the calls. This means that both
 //   op++/op+= will increase the displacement counter by 1, and op--/op-= will decrease the
 //   displacement counter by 1.
-template <class It>
+template <class It,
+          class StrideCountType        = std::iter_difference_t<It>,
+          class StrideDisplacementType = std::iter_difference_t<It>>
 class stride_counting_iterator {
 public:
     using value_type = typename iter_value_or_void<It>::type;
@@ -743,16 +745,40 @@ class stride_counting_iterator {
         std::conditional_t<std::input_iterator<It>,         std::input_iterator_tag,
         /* else */                                          std::output_iterator_tag
     >>>>>;
+    using iterator_category = iterator_concept;
 
     stride_counting_iterator() requires std::default_initializable<It> = default;
 
     constexpr explicit stride_counting_iterator(It const& it) : base_(base(it)) { }
 
+    constexpr explicit stride_counting_iterator(
+        It const& it, StrideCountType* stride_count, StrideDisplacementType* stride_displacement)
+        : base_(base(it)), stride_count_(stride_count), stride_displacement_(stride_displacement) {}
+
+    constexpr stride_counting_iterator(const stride_counting_iterator& o) { *this = o; }
+    constexpr stride_counting_iterator(stride_counting_iterator&& o) { *this = o; }
+
+    constexpr stride_counting_iterator& operator=(const stride_counting_iterator& o) {
+      base_ = o.base_;
+      // if memory backing count is owned by the object, copy values
+      if (o.stride_count_ == &o.stride_count_default_) {
+        assert(o.stride_displacement_ == &o.stride_displacement_default_);
+        *stride_count_        = *o.stride_count_;
+        *stride_displacement_ = *o.stride_displacement_;
+        return *this;
+      }
+      // otherwise share the same externally-owned variables
+      stride_count_        = o.stride_count_;
+      stride_displacement_ = o.stride_displacement_;
+      return *this;
+    }
+    constexpr stride_counting_iterator& operator=(stride_counting_iterator&& o) { return *this = o; }
+
     friend constexpr It base(stride_counting_iterator const& it) { return It(it.base_); }
 
-    constexpr difference_type stride_count() const { return stride_count_; }
+    constexpr StrideCountType stride_count() const { return *stride_count_; }
 
-    constexpr difference_type stride_displacement() const { return stride_displacement_; }
+    constexpr StrideDisplacementType stride_displacement() const { return *stride_displacement_; }
 
     constexpr decltype(auto) operator*() const { return *It(base_); }
 
@@ -761,8 +787,8 @@ class stride_counting_iterator {
     constexpr stride_counting_iterator& operator++() {
         It tmp(base_);
         base_ = base(++tmp);
-        ++stride_count_;
-        ++stride_displacement_;
+        ++*stride_count_;
+        ++*stride_displacement_;
         return *this;
     }
 
@@ -781,8 +807,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(--tmp);
-        ++stride_count_;
-        --stride_displacement_;
+        ++*stride_count_;
+        --*stride_displacement_;
         return *this;
     }
 
@@ -799,8 +825,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(tmp += n);
-        ++stride_count_;
-        ++stride_displacement_;
+        ++*stride_count_;
+        ++*stride_displacement_;
         return *this;
     }
 
@@ -809,8 +835,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(tmp -= n);
-        ++stride_count_;
-        --stride_displacement_;
+        ++*stride_count_;
+        --*stride_displacement_;
         return *this;
     }
 
@@ -873,8 +899,11 @@ class stride_counting_iterator {
 
 private:
     decltype(base(std::declval<It>())) base_;
-    difference_type stride_count_ = 0;
-    difference_type stride_displacement_ = 0;
+    StrideCountType stride_count_default_               = 0;
+    StrideDisplacementType stride_displacement_default_ = 0;
+
+    StrideCountType* stride_count_               = &stride_count_default_;
+    StrideDisplacementType* stride_displacement_ = &stride_displacement_default_;
 };
 template <class It>
 stride_counting_iterator(It) -> stride_counting_iterator<It>;

>From f6bcf2743080ced55d9d589daed611c5e9696ac5 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:52:37 +0100
Subject: [PATCH 02/14] [libc++] Introduce one-sided binary search for
 lower_bound on non-random iterators.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
advantage of being Ω(1) rather than the classic algorithm's Ω(log(n)), with the downside of executing at most
2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing the
container's size upfront, which adds Ω(n) iterator increments to the complexity. The second one is when you're
traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
would yield Ω(n*log(n)) comparisons and, for non-random-access iterators, Ω(n^2) iterator increments, whereas the
one-sided version will yield O(n) operations on both counts, with an Ω(log(n)) bound on the number of comparisons.
---
 .../include/__algorithm/iterator_operations.h | 47 +++++++++++++
 libcxx/include/__algorithm/lower_bound.h      | 69 +++++++++++++++++--
 2 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index e6176da4f5606d..d73573747087e0 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -87,6 +87,53 @@ struct _IterOps<_ClassicAlgPolicy> {
     std::advance(__iter, __count);
   }
 
+  // advance with sentinel, a la std::ranges::advance
+  // it's unclear whether _Iter has a difference_type and whether that's signed, so we play it safe:
+  // use the incoming type for returning and steer clear of negative overflows
+  template <class _Iter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+    return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- InputIterator overload
+  template <class _InputIter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  __advance(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
+    _Distance __dist{};
+    for (; __dist < __count && __iter != __sentinel; ++__dist)
+      ++__iter;
+    return __count - __dist;
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator overload
+  template <class _BiDirIter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  __advance(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
+    _Distance __dist{};
+    if (__count >= 0)
+      for (; __dist < __count && __iter != __sentinel; ++__dist)
+        ++__iter;
+    else
+      for (__count = -__count; __dist < __count && __iter != __sentinel; ++__dist)
+        --__iter;
+    return __count - __dist;
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- RandomAccessIterator overload
+  template <class _RandIter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  __advance(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
+    auto __dist = _IterOps::distance(__iter, __sentinel);
+    _LIBCPP_ASSERT_UNCATEGORIZED(
+        __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count<0");
+    if (__count < 0)
+      __dist = __dist > __count ? __dist : __count;
+    else
+      __dist = __dist < __count ? __dist : __count;
+    __iter += __dist;
+    return __count - __dist;
+  }
+
   // distance
   template <class _Iter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 91c3bdaafd0cfd..b432829667fa99 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -27,11 +27,13 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
-_Iter __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
-  auto __len = _IterOps<_AlgPolicy>::distance(__first, __last);
-
+template <class _AlgPolicy, class _Iter, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting(
+    _Iter __first,
+    const _Type& __value,
+    typename iterator_traits<_Iter>::difference_type __len,
+    _Comp& __comp,
+    _Proj& __proj) {
   while (__len != 0) {
     auto __l2 = std::__half_positive(__len);
     _Iter __m = __first;
@@ -46,13 +48,68 @@ _Iter __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __
   return __first;
 }
 
+// One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
+// advantage of being Ω(1) rather than the classic algorithm's Ω(log(n)), with the downside of executing at most
+// 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
+// the first one is when operating over non-random iterators, because the classic algorithm requires knowing the
+// container's size upfront, which adds Ω(n) iterator increments to the complexity. The second one is when you're
+// traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
+// would yield Ω(n*log(n)) comparisons and, for non-random iterators, Ω(n^2) iterator increments, whereas the one-sided
+// version will yield O(n) operations on both counts, with an Ω(log(n)) bound on the number of comparisons.
+template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+__lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+  // static_assert(std::is_base_of<std::forward_iterator_tag, typename _IterOps<_AlgPolicy>::template
+  // __iterator_category<_Iter>>::value,
+  //       "lower_bound() is a multipass algorithm and requires forward iterator or better");
+
+  using _Distance = typename iterator_traits<_Iter>::difference_type;
+  for (_Distance __step = 1; __first != __last; __step <<= 1) {
+    auto __it   = __first;
+    auto __dist = __step - _IterOps<_AlgPolicy>::advance(__it, __step, __last);
+    // once we reach the last range where needle can be we must start
+    // looking inwards, bisecting that range
+    if (__it == __last || !std::__invoke(__comp, std::__invoke(__proj, *__it), __value)) {
+      return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
+    }
+    // range not found, move forward!
+    __first = std::move(__it);
+  }
+  return __first;
+}
+
+template <class _AlgPolicy, class _InputIter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIter __lower_bound(
+    _InputIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj, std::input_iterator_tag) {
+  return std::__lower_bound_onesided<_AlgPolicy>(__first, __last, __value, __comp, __proj);
+}
+
+template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter __lower_bound(
+    _RandIter __first,
+    _Sent __last,
+    const _Type& __value,
+    _Comp& __comp,
+    _Proj& __proj,
+    std::random_access_iterator_tag) {
+  const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
+  return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
+}
+
+template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp&& __comp, _Proj&& __proj) {
+  return std::__lower_bound<_AlgPolicy>(
+      __first, __last, __value, __comp, __proj, typename _IterOps<_AlgPolicy>::template __iterator_category<_Iter>());
+}
+
 template <class _ForwardIterator, class _Tp, class _Compare>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
 _ForwardIterator lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {
   static_assert(__is_callable<_Compare, decltype(*__first), const _Tp&>::value,
                 "The comparator has to be callable");
   auto __proj = std::__identity();
-  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, __comp, __proj);
+  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, std::move(__comp), std::move(__proj));
 }
 
 template <class _ForwardIterator, class _Tp>

>From 36bb63e36b56f98da2b808ab55410bec5c1d0bb5 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:53:09 +0100
Subject: [PATCH 03/14] [libc++][test] Add set_intersection complexity
 validation tests prior to introducing use of one-sided binary search to
 fast-forward over ranges of elements.

---
 .../ranges_set_intersection.pass.cpp          | 240 +++++++++++++++++-
 1 file changed, 234 insertions(+), 6 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
index 0ee89e0131a073..30cedd19038d7b 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
@@ -28,6 +28,9 @@
 #include <algorithm>
 #include <array>
 #include <concepts>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
 
 #include "almost_satisfies_types.h"
 #include "MoveOnly.h"
@@ -93,14 +96,17 @@ static_assert(!HasSetIntersectionRange<UncheckedRange<MoveOnly*>, UncheckedRange
 
 using std::ranges::set_intersection_result;
 
+// TODO: std::ranges::set_intersection calls std::ranges::copy
+// std::ranges::copy(contiguous_iterator<int*>, sentinel_wrapper<contiguous_iterator<int*>>, contiguous_iterator<int*>) doesn't seem to work.
+// It seems that std::ranges::copy calls std::copy, which unwraps contiguous_iterator<int*> into int*,
+// and then it failed because there is no == between int* and sentinel_wrapper<contiguous_iterator<int*>>
+template <typename Iter>
+using SentinelWorkaround = std::conditional_t<std::contiguous_iterator<Iter>, Iter, sentinel_wrapper<Iter>>;
+
 template <class In1, class In2, class Out, std::size_t N1, std::size_t N2, std::size_t N3>
 constexpr void testSetIntersectionImpl(std::array<int, N1> in1, std::array<int, N2> in2, std::array<int, N3> expected) {
-  // TODO: std::ranges::set_intersection calls std::ranges::copy
-  // std::ranges::copy(contiguous_iterator<int*>, sentinel_wrapper<contiguous_iterator<int*>>, contiguous_iterator<int*>) doesn't seem to work.
-  // It seems that std::ranges::copy calls std::copy, which unwraps contiguous_iterator<int*> into int*,
-  // and then it failed because there is no == between int* and sentinel_wrapper<contiguous_iterator<int*>>
-  using Sent1 = std::conditional_t<std::contiguous_iterator<In1>, In1, sentinel_wrapper<In1>>;
-  using Sent2 = std::conditional_t<std::contiguous_iterator<In2>, In2, sentinel_wrapper<In2>>;
+  using Sent1 = SentinelWorkaround<In1>;
+  using Sent2 = SentinelWorkaround<In2>;
 
   // iterator overload
   {
@@ -272,6 +278,225 @@ constexpr void runAllIteratorPermutationsTests() {
   static_assert(withAllPermutationsOfInIter1AndInIter2<contiguous_iterator<int*>>());
 }
 
+namespace {
+struct [[nodiscard]] OperationCounts {
+  std::size_t comparisons{};
+  struct PerInput {
+    std::size_t proj{};
+    std::size_t iterator_strides{};
+    std::ptrdiff_t iterator_displacement{};
+
+    // IGNORES proj!
+    [[nodiscard]] constexpr bool operator==(const PerInput& o) const {
+      return iterator_strides == o.iterator_strides && iterator_displacement == o.iterator_displacement;
+    }
+
+    [[nodiscard]] constexpr bool matchesExpectation(const PerInput& expect) {
+      return proj <= expect.proj && iterator_strides <= expect.iterator_strides &&
+             iterator_displacement <= expect.iterator_displacement;
+    }
+  };
+  std::array<PerInput, 2> in;
+
+  [[nodiscard]] constexpr bool matchesExpectation(const OperationCounts& expect) {
+    return comparisons <= expect.comparisons && in[0].matchesExpectation(expect.in[0]) &&
+           in[1].matchesExpectation(expect.in[1]);
+  }
+
+  [[nodiscard]] constexpr bool operator==(const OperationCounts& o) const {
+    return comparisons == o.comparisons && std::ranges::equal(in, o.in);
+  }
+};
+} // namespace
+
+#include <iostream>
+template <template <class...> class In1,
+          template <class...>
+          class In2,
+          class Out,
+          std::size_t N1,
+          std::size_t N2,
+          std::size_t N3>
+constexpr void testSetIntersectionAndReturnOpCounts(
+    std::array<int, N1> in1,
+    std::array<int, N2> in2,
+    std::array<int, N3> expected,
+    const OperationCounts& expectedOpCounts) {
+  OperationCounts ops;
+
+  const auto comp = [&ops](int x, int y) {
+    ++ops.comparisons;
+    return x < y;
+  };
+
+  std::array<int, N3> out;
+
+  stride_counting_iterator b1(
+      In1<decltype(in1.begin())>(in1.begin()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator e1(
+      In1<decltype(in1.end()) >(in1.end()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator b2(
+      In2<decltype(in2.begin())>(in2.begin()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+  stride_counting_iterator e2(
+      In2<decltype(in2.end()) >(in2.end()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+
+  std::set_intersection(b1, e1, b2, e2, Out(out.data()), comp);
+
+  assert(std::ranges::equal(out, expected));
+  assert(ops.matchesExpectation(expectedOpCounts));
+}
+
+template <template <class...> class In1,
+          template <class...>
+          class In2,
+          class Out,
+          std::size_t N1,
+          std::size_t N2,
+          std::size_t N3>
+constexpr void testRangesSetIntersectionAndReturnOpCounts(
+    std::array<int, N1> in1,
+    std::array<int, N2> in2,
+    std::array<int, N3> expected,
+    const OperationCounts& expectedOpCounts) {
+  OperationCounts ops;
+
+  const auto comp = [&ops](int x, int y) {
+    ++ops.comparisons;
+    return x < y;
+  };
+
+  const auto proj1 = [&ops](const int& i) {
+    ++ops.in[0].proj;
+    return i;
+  };
+
+  const auto proj2 = [&ops](const int& i) {
+    ++ops.in[1].proj;
+    return i;
+  };
+
+  std::array<int, N3> out;
+
+  stride_counting_iterator b1(
+      In1<decltype(in1.begin())>(in1.begin()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator e1(
+      In1<decltype(in1.end()) >(in1.end()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator b2(
+      In2<decltype(in2.begin())>(in2.begin()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+  stride_counting_iterator e2(
+      In2<decltype(in2.end()) >(in2.end()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+
+  std::ranges::subrange r1{b1, SentinelWorkaround<decltype(e1)>{e1}};
+  std::ranges::subrange r2{b2, SentinelWorkaround<decltype(e2)>{e2}};
+  std::same_as<set_intersection_result<decltype(e1), decltype(e2), Out>> decltype(auto) result =
+      std::ranges::set_intersection(r1, r2, Out{out.data()}, comp, proj1, proj2);
+  assert(std::ranges::equal(out, expected));
+  assert(base(result.in1) == base(e1));
+  assert(base(result.in2) == base(e2));
+  assert(base(result.out) == out.data() + out.size());
+  assert(ops.matchesExpectation(expectedOpCounts));
+}
+
+template <template <typename...> class In1, template <typename...> class In2, class Out>
+constexpr void testComplexityParameterizedIter() {
+  // Worst-case complexity:
+  // Let N=(last1 - first1) and M=(last2 - first2)
+  // At most 2*(N+M) - 1 comparisons and applications of each projection.
+  // At most 2*(N+M) iterator mutations.
+  {
+    std::array r1{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array r2{2, 4, 6, 8, 10, 12, 14, 16, 18, 20};
+    std::array<int, 0> expected{};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 37;
+    expectedCounts.in[0].proj                  = 37;
+    expectedCounts.in[0].iterator_strides      = 30;
+    expectedCounts.in[0].iterator_displacement = 30;
+    expectedCounts.in[1]                       = expectedCounts.in[0];
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+
+  {
+    std::array r1{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array r2{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array expected{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 38;
+    expectedCounts.in[0].proj                  = 38;
+    expectedCounts.in[0].iterator_strides      = 30;
+    expectedCounts.in[0].iterator_displacement = 30;
+    expectedCounts.in[1]                       = expectedCounts.in[0];
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+
+  // Lower complexity when there is low overlap between ranges: we can make 2*log(X) comparisons when one range
+  // has X elements that can be skipped over.
+  {
+    std::array r1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    std::array r2{15};
+    std::array expected{15};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 8;
+    expectedCounts.in[0].proj                  = 8;
+    expectedCounts.in[0].iterator_strides      = 24;
+    expectedCounts.in[0].iterator_displacement = 24;
+    expectedCounts.in[1].proj                  = 8;
+    expectedCounts.in[1].iterator_strides      = 3;
+    expectedCounts.in[1].iterator_displacement = 3;
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+
+  {
+    std::array r1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    std::array r2{0, 16};
+    std::array<int, 0> expected{};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 10;
+    expectedCounts.in[0].proj                  = 10;
+    expectedCounts.in[0].iterator_strides      = 24;
+    expectedCounts.in[0].iterator_displacement = 24;
+    expectedCounts.in[1].proj                  = 10;
+    expectedCounts.in[1].iterator_strides      = 4;
+    expectedCounts.in[1].iterator_displacement = 4;
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+}
+
+template <template <typename...> class In2, class Out>
+constexpr void testComplexityParameterizedIterPermutateIn1() {
+  //common_input_iterator
+  testComplexityParameterizedIter<forward_iterator, In2, Out>();
+  testComplexityParameterizedIter<bidirectional_iterator, In2, Out>();
+  testComplexityParameterizedIter<random_access_iterator, In2, Out>();
+}
+
+template <class Out>
+constexpr void testComplexityParameterizedIterPermutateIn1In2() {
+  testComplexityParameterizedIterPermutateIn1<forward_iterator, Out>();
+  testComplexityParameterizedIterPermutateIn1<bidirectional_iterator, Out>();
+  testComplexityParameterizedIterPermutateIn1<random_access_iterator, Out>();
+}
+
+constexpr bool testComplexityMultipleTypes() {
+  //testComplexityParameterizedIter<cpp20_input_iterator, random_access_iterator, OutIter>();
+  testComplexityParameterizedIterPermutateIn1In2<forward_iterator<int*>>();
+  testComplexityParameterizedIterPermutateIn1In2<bidirectional_iterator<int*>>();
+  testComplexityParameterizedIterPermutateIn1In2<random_access_iterator<int*>>();
+  return true;
+}
+
 constexpr bool test() {
   // check that every element is copied exactly once
   {
@@ -572,5 +797,8 @@ int main(int, char**) {
   // than the step limit.
   runAllIteratorPermutationsTests();
 
+  testComplexityMultipleTypes();
+  static_assert(testComplexityMultipleTypes());
+
   return 0;
 }

>From c23272c389329d3af83c0f58f896ee6ea47260ed Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:53:31 +0100
Subject: [PATCH 04/14] [libc++] Introduce use of __lower_bound_onesided to
 improve average complexity of set_intersection.

---
 libcxx/include/__algorithm/set_intersection.h | 154 +++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index f2603fe1365ac3..556738022f4859 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -12,9 +12,13 @@
 #include <__algorithm/comp.h>
 #include <__algorithm/comp_ref_type.h>
 #include <__algorithm/iterator_operations.h>
+#include <__algorithm/lower_bound.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
+#include <__type_traits/is_same.h>
+#include <__utility/exchange.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -36,9 +40,122 @@ struct __set_intersection_result {
 };
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-__set_intersection(
-    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+struct _LIBCPP_NODISCARD_EXT __set_intersector {
+  _InIter1& __first1_;
+  const _Sent1& __last1_;
+  _InIter2& __first2_;
+  const _Sent2& __last2_;
+  _OutIter& __result_;
+  _Compare& __comp_;
+  static constexpr auto __proj_ = std::__identity();
+  bool __prev_advanced_         = true;
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersector(
+      _InIter1& __first1, _Sent1& __last1, _InIter2& __first2, _Sent2& __last2, _OutIter& __result, _Compare& __comp)
+      : __first1_(__first1),
+        __last1_(__last1),
+        __first2_(__first2),
+        __last2_(__last2),
+        __result_(__result),
+        __comp_(__comp) {}
+
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+      _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+      operator()() && {
+    while (__first2_ != __last2_) {
+      __advance1_and_maybe_add_result();
+      if (__first1_ == __last1_)
+        break;
+      __advance2_and_maybe_add_result();
+    }
+    return __set_intersection_result<_InIter1, _InIter2, _OutIter>(
+        _IterOps<_AlgPolicy>::next(std::move(__first1_), std::move(__last1_)),
+        _IterOps<_AlgPolicy>::next(std::move(__first2_), std::move(__last2_)),
+        std::move(__result_));
+  }
+
+private:
+  // advance __iter to the first element in the range where !__comp_(__iter, __value)
+  // add result if this is the second consecutive call without advancing
+  // this method only works if you alternate calls between __advance1_and_maybe_add_result() and
+  // __advance2_and_maybe_add_result()
+  template <class _Iter, class _Sent, class _Value>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
+  __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
+    // use one-sided lower bound for improved algorithmic complexity bounds
+    const auto __tmp =
+        std::exchange(__iter, std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_));
+    __add_output_unless(__tmp != __iter);
+  }
+
+  // advance __first1_ to the first element in the range where !__comp_(*__first1_, *__first2_)
+  // add result if neither __first1_ nor __first2_ advanced in the last attempt (meaning they are equal)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __advance1_and_maybe_add_result() {
+    __advance_and_maybe_add_result(__first1_, __last1_, *__first2_);
+  }
+
+  // advance __first2_ to the first element in the range where !__comp_(*__first2_, *__first1_)
+  // add result if neither __first1_ nor __first2_ advanced in the last attempt (meaning they are equal)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __advance2_and_maybe_add_result() {
+    __advance_and_maybe_add_result(__first2_, __last2_, *__first1_);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __add_output_unless(bool __advanced) {
+    if (__advanced | __prev_advanced_) {
+      __prev_advanced_ = __advanced;
+    } else {
+      *__result_ = *__first1_;
+      ++__result_;
+      ++__first1_;
+      ++__first2_;
+      __prev_advanced_ = true;
+    }
+  }
+};
+
+// with forward iterators we can use binary search to skip over entries
+template <class _AlgPolicy,
+          class _Compare,
+          class _InForwardIter1,
+          class _Sent1,
+          class _InForwardIter2,
+          class _Sent2,
+          class _OutIter>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
+    __set_intersection(
+        _InForwardIter1 __first1,
+        _Sent1 __last1,
+        _InForwardIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::forward_iterator_tag,
+        std::forward_iterator_tag) {
+  std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
+      __intersector(__first1, __last1, __first2, __last2, __result, __comp);
+  return std::move(__intersector)();
+}
+
+// input iterators are not suitable for multipass algorithms, so we stick to the classic single-pass version
+template <class _AlgPolicy,
+          class _Compare,
+          class _InInputIter1,
+          class _Sent1,
+          class _InInputIter2,
+          class _Sent2,
+          class _OutIter>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+    __set_intersection(
+        _InInputIter1 __first1,
+        _Sent1 __last1,
+        _InInputIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::input_iterator_tag,
+        std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -52,12 +169,41 @@ __set_intersection(
     }
   }
 
-  return __set_intersection_result<_InIter1, _InIter2, _OutIter>(
+  return std::__set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>(
       _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)),
       _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)),
       std::move(__result));
 }
 
+template <class _AlgPolicy, class _Iter>
+class __set_intersection_iter_category {
+  template <class _It>
+  using __cat = typename std::_IterOps<_AlgPolicy>::template __iterator_category<_It>;
+  template <class _It>
+  static auto test(__cat<_It>*) -> __cat<_It>;
+  template <class>
+  static std::input_iterator_tag test(...);
+
+public:
+  using __type = decltype(test<_Iter>(nullptr));
+};
+
+template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+    __set_intersection(
+        _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+  return std::__set_intersection<_AlgPolicy>(
+      std::move(__first1),
+      std::move(__last1),
+      std::move(__first2),
+      std::move(__last2),
+      std::move(__result),
+      std::forward<_Compare>(__comp),
+      typename std::__set_intersection_iter_category<_AlgPolicy, _InIter1>::__type(),
+      typename std::__set_intersection_iter_category<_AlgPolicy, _InIter2>::__type());
+}
+
 template <class _InputIterator1, class _InputIterator2, class _OutputIterator, class _Compare>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_intersection(
     _InputIterator1 __first1,

>From 0b57ea00b44dbe69bc5125a08691a72b0dea42ce Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:18:21 +0000
Subject: [PATCH 05/14] Fix `constexpr` annotations.

---
 libcxx/include/__algorithm/iterator_operations.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index d73573747087e0..21117e6b7d7609 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -91,13 +91,13 @@ struct _IterOps<_ClassicAlgPolicy> {
   // it's unclear whether _Iter has a difference_type and whether that's signed, so we play it safe:
   // use the incoming type for returning and steer clear of negative overflows
   template <class _Iter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
     return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
   }
 
   // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
   template <class _InputIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
     _Distance __dist{};
     for (; __dist < __count && __iter != __sentinel; ++__dist)
@@ -107,7 +107,7 @@ struct _IterOps<_ClassicAlgPolicy> {
 
   // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
   template <class _BiDirIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
     _Distance __dist{};
     if (__count >= 0)

>From 08af54897cd8e39a25a1e97b0174b68beb408cd0 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:18:59 +0000
Subject: [PATCH 06/14] Remove std::exchange dependency from
 std::set_intersection so it works before C++14

---
 libcxx/include/__algorithm/set_intersection.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 556738022f4859..46f6fbe4d3dd2b 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -83,8 +83,8 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
   __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
     // use one-sided lower bound for improved algorithmic complexity bounds
-    const auto __tmp =
-        std::exchange(__iter, std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_));
+    const auto __tmp = std::move(__iter); // old position; __iter is moved-from until it is reassigned below
+    __iter = std::__lower_bound_onesided<_AlgPolicy>(__tmp, __sentinel, __value, __comp_, __proj_);
     __add_output_unless(__tmp != __iter);
   }
 

>From 7aa3927064083b6a96bfcc4e00d1b4fc24d9c96e Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:20:06 +0000
Subject: [PATCH 07/14] Review feedback: don't use one-sided lower bound in
 lower_bound() itself since that violates the complexity guarantees from the
 standard.

---
 libcxx/include/__algorithm/lower_bound.h       | 18 ++----------------
 .../lower.bound/lower_bound.pass.cpp           | 10 +++++-----
 .../lower.bound/lower_bound_comp.pass.cpp      | 13 ++++++-------
 3 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index b432829667fa99..3febcb411268fb 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -78,38 +78,24 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
   return __first;
 }
 
-template <class _AlgPolicy, class _InputIter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIter __lower_bound(
-    _InputIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj, std::input_iterator_tag) {
-  return std::__lower_bound_onesided<_AlgPolicy>(__first, __last, __value, __comp, __proj);
-}
-
 template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter __lower_bound(
     _RandIter __first,
     _Sent __last,
     const _Type& __value,
     _Comp& __comp,
-    _Proj& __proj,
-    std::random_access_iterator_tag) {
+    _Proj& __proj) {
   const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
   return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
 }
 
-template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
-__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp&& __comp, _Proj&& __proj) {
-  return std::__lower_bound<_AlgPolicy>(
-      __first, __last, __value, __comp, __proj, typename _IterOps<_AlgPolicy>::template __iterator_category<_Iter>());
-}
-
 template <class _ForwardIterator, class _Tp, class _Compare>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
 _ForwardIterator lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {
   static_assert(__is_callable<_Compare, decltype(*__first), const _Tp&>::value,
                 "The comparator has to be callable");
   auto __proj = std::__identity();
-  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, std::move(__comp), std::move(__proj));
+  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, __comp, __proj);
 }
 
 template <class _ForwardIterator, class _Tp>
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
index 5c11962d137779..dd2916338e8f6e 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
@@ -45,14 +45,14 @@ test(Iter first, Iter last, const T& value)
   stride_counting_iterator l(last, &strides, &displacement);
 
   auto i = std::lower_bound(f, l, value);
-  for (auto j = f; j != i; ++j)
+  for (auto j = base(f); j != base(i); ++j)
     assert(*j < value);
-  for (auto j = i; j != l; ++j)
+  for (auto j = base(i); j != base(l); ++j)
     assert(!(*j < value));
 
-  auto len = std::distance(first, last);
-  assert(strides <= 2.5 * len + 1);
-  assert(displacement <= 2.5 * len + 1);
+  auto len = static_cast<std::size_t>(std::distance(first, last));
+  assert(strides <= 2 * len);
+  assert(displacement <= 2 * len);
 }
 
 template <class Iter>
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
index 05fd43eada4616..ff928e23b9006a 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
@@ -51,16 +51,15 @@ test(Iter first, Iter last, const T& value)
   };
 
   auto i = std::lower_bound(f, l, value, cmp);
-
-  for (auto j = f; j != i; ++j)
+  for (auto j = base(f); j != base(i); ++j)
     assert(std::greater<int>()(*j, value));
-  for (auto j = i; j != l; ++j)
+  for (auto j = base(i); j != base(l); ++j)
     assert(!std::greater<int>()(*j, value));
 
-  auto len = std::distance(first, last);
-  assert(strides <= 2.5 * len + 1);
-  assert(displacement <= 2.5 * len + 1);
-  assert(comparisons <= 2 * ceil(log(len + 1) + 2));
+  auto len = static_cast<std::size_t>(std::distance(first, last));
+  assert(strides <= 2 * len);
+  assert(displacement <= 2 * len);
+  assert(comparisons <= std::ceil(std::log2(len + 1)));
 }
 
 template <class Iter>

>From c44c2a2b8ea818287b859c5ce318d195c59e9d65 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:21:33 +0000
Subject: [PATCH 08/14] Create new benchmark for set_intersection().

---
 libcxx/benchmarks/CMakeLists.txt              |   1 +
 .../algorithms/set_intersection.bench.cpp     | 224 ++++++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 libcxx/benchmarks/algorithms/set_intersection.bench.cpp

diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 7591f34d938bf8..da2ea6fd4c3d1c 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -192,6 +192,7 @@ set(BENCHMARK_TESTS
     algorithms/ranges_sort.bench.cpp
     algorithms/ranges_sort_heap.bench.cpp
     algorithms/ranges_stable_sort.bench.cpp
+    algorithms/set_intersection.bench.cpp
     algorithms/sort.bench.cpp
     algorithms/sort_heap.bench.cpp
     algorithms/stable_sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
new file mode 100644
index 00000000000000..c6a01707d65311
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -0,0 +1,224 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <forward_list>
+#include <iterator>
+#include <set>
+#include <vector>
+
+#include "common.h"
+
+namespace {
+
+// types of containers we'll want to test, covering interesting iterator types
+struct VectorContainer {
+  template <typename... Args>
+  using type = std::vector<Args...>;
+
+  static constexpr const char* Name = "Vector";
+};
+
+struct SetContainer {
+  template <typename... Args>
+  using type = std::set<Args...>;
+
+  static constexpr const char* Name = "Set";
+};
+
+struct ForwardListContainer {
+  template <typename... Args>
+  using type = std::forward_list<Args...>;
+
+  static constexpr const char* Name = "ForwardList";
+};
+
+using AllContainerTypes = std::tuple<VectorContainer, SetContainer, ForwardListContainer>;
+
+// set_intersection performance may depend on where matching values lie
+enum class OverlapPosition {
+    Nowhere,
+    Front,
+    Back,
+    Interlaced,
+};
+
+struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 4> {
+  static constexpr const char* Names[] = {
+      "Nowhere", "Front", "Back", "Interlaced"};
+};
+
+// functor that moves elements from an iterator range into a new Container instance
+template <typename Container>
+struct MoveInto {};
+
+template <typename T>
+struct MoveInto<std::vector<T>> {
+    template <class It>
+    [[nodiscard]] static std::vector<T> operator()(It first, It last) {
+        std::vector<T> out;
+        std::move(first, last, std::back_inserter(out));
+        return out;
+    }
+};
+
+template <typename T>
+struct MoveInto<std::forward_list<T>> {
+    template <class It>
+    [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
+        std::forward_list<T> out;
+        std::move(first, last, std::front_inserter(out));
+        out.reverse();
+        return out;
+    }
+};
+
+template <typename T>
+struct MoveInto<std::set<T>> {
+    template <class It>
+    [[nodiscard]] static std::set<T> operator()(It first, It last) {
+        std::set<T> out;
+        std::move(first, last, std::inserter(out, out.begin()));
+        return out;
+    }
+};
+
+// lightweight wrapping around fillValues() which puts a little effort into
+// making values that would be contiguous when sorted non-contiguous in memory
+template <typename T>
+std::vector<T> getVectorOfRandom(size_t N) {
+  std::vector<T> V;
+  fillValues(V, N, Order::Random);
+  sortValues(V, Order::Random);
+  return std::vector<T>(V);
+}
+
+// forward_iterator wrapping which, for each increment, moves the underlying iterator forward Stride elements
+template <typename Wrapped>
+struct StridedFwdIt {
+  Wrapped Base;
+  unsigned Stride;
+
+  using iterator_category = std::forward_iterator_tag;
+  using difference_type = typename Wrapped::difference_type;
+  using value_type = typename Wrapped::value_type;
+  using pointer = typename Wrapped::pointer;
+  using reference = typename Wrapped::reference;
+
+  StridedFwdIt(Wrapped B, unsigned Stride_) : Base(B), Stride(Stride_) { assert(Stride != 0); }
+
+  StridedFwdIt operator++() { for (unsigned I=0; I<Stride; ++I) ++Base; return *this; }
+  StridedFwdIt operator++(int) { auto Tmp = *this; ++*this; return Tmp; }
+  value_type& operator*() { return *Base; }
+  const value_type& operator*() const { return *Base; }
+  value_type& operator->() { return *Base; }
+  const value_type& operator->() const { return *Base; }
+  bool operator==(const StridedFwdIt& o) const { return Base==o.Base; }
+  bool operator!=(const StridedFwdIt& o) const { return !operator==(o); }
+};
+template <typename Wrapped> StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
+
+
+// realistically, data won't all be nicely contiguous in a container
+// we'll go through some effort to ensure that it's shuffled through memory
+template <class Container>
+std::pair<Container, Container> genCacheUnfriendlyData(size_t Size1, size_t Size2, OverlapPosition Pos) {
+  using ValueType = typename Container::value_type;
+  const MoveInto<Container> moveInto;
+  const auto SrcSize = Pos == OverlapPosition::Nowhere ? Size1 + Size2 : std::max(Size1, Size2);
+  std::vector<ValueType> Src = getVectorOfRandom<ValueType>(SrcSize);
+
+  if (Pos == OverlapPosition::Nowhere) {
+    std::sort(Src.begin(), Src.end());
+    return std::make_pair(
+        moveInto(Src.begin(), Src.begin() + Size1),
+        moveInto(Src.begin() + Size1, Src.end()));
+  }
+
+  // all other overlap types will have to copy some part of the data, but if
+  // we copy after sorting it will likely have high cache locality, so we sort
+  // each copy separately
+  auto Copy = Src;
+  std::sort(Src.begin(), Src.end());
+  std::sort(Copy.begin(), Copy.end());
+
+  switch(Pos) {
+    case OverlapPosition::Nowhere:
+      break;
+
+    case OverlapPosition::Front:
+      return std::make_pair(
+          moveInto(Src.begin(), Src.begin() + Size1),
+          moveInto(Copy.begin(), Copy.begin() + Size2));
+
+    case OverlapPosition::Back:
+      return std::make_pair(
+          moveInto(Src.begin() + (Src.size() - Size1), Src.end()),
+          moveInto(Copy.begin() + (Copy.size() - Size2), Copy.end()));
+
+    case OverlapPosition::Interlaced:
+      const auto Stride1 = Size1 < Size2 ? Size2/Size1 : 1;
+      const auto Stride2 = Size2 < Size1 ? Size1/Size2 : 1;
+      return std::make_pair(
+          moveInto(StridedFwdIt(Src.begin(), Stride1), StridedFwdIt(Src.end(), Stride1)),
+          moveInto(StridedFwdIt(Copy.begin(), Stride2), StridedFwdIt(Copy.end(), Stride2)));
+  }
+  abort();
+  return std::pair<Container, Container>();
+}
+
+
+template <class ValueType, class Container, class Overlap>
+struct SetIntersection {
+  using ContainerType = typename Container::template type<Value<ValueType>>;
+  size_t Size1;
+  size_t Size2;
+
+  SetIntersection(size_t M, size_t N) : Size1(M), Size2(N) {}
+
+  void run(benchmark::State& state) const {
+    state.PauseTiming();
+    auto Input = genCacheUnfriendlyData<ContainerType>(Size1, Size2, Overlap());
+    std::vector<Value<ValueType>> out(std::min(Size1, Size2));
+
+    size_t cmp = 0; // comparison counter: must start at zero, it is incremented by trackingLess and reported below
+    auto trackingLess = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
+        ++cmp;
+        return std::less<Value<ValueType>>{}(lhs, rhs);
+    };
+
+    const auto BatchSize =  std::max(size_t{16}, (2*TestSetElements) / (Size1+Size2));
+    state.ResumeTiming();
+
+    for (const auto& _ : state) {
+      while (state.KeepRunningBatch(BatchSize)) {
+        for (unsigned i=0; i<BatchSize; ++i) {
+          const auto& [C1, C2] = Input;
+          auto outIter = std::set_intersection(C1.begin(), C1.end(), C2.begin(), C2.end(), out.begin(), trackingLess);
+          benchmark::DoNotOptimize(outIter);
+          state.counters["Comparisons"] = cmp;
+        }
+      }
+    }
+  }
+
+  std::string name() const {
+    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name +
+        ValueType::name() + '_' + std::to_string(Size1) + '_' + std::to_string(Size2);
+  }
+};
+
+} // namespace
+
+int main(int argc, char** argv) {/**/
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(Quantities, Quantities);
+  benchmark::RunSpecifiedBenchmarks();
+}

>From 46cc95f71742e32d8131a5b08fa271b122a919c3 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Fri, 5 Jan 2024 23:04:19 +0000
Subject: [PATCH 09/14] Formatting fixups.

---
 .../algorithms/set_intersection.bench.cpp     | 201 +++++++++---------
 .../include/__algorithm/iterator_operations.h |   3 +-
 libcxx/include/__algorithm/lower_bound.h      |   8 +-
 libcxx/include/__algorithm/set_intersection.h |  52 ++---
 4 files changed, 131 insertions(+), 133 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index c6a01707d65311..4fa411bba43549 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -42,15 +42,14 @@ using AllContainerTypes = std::tuple<VectorContainer, SetContainer, ForwardListC
 
 // set_intersection performance may depend on where matching values lie
 enum class OverlapPosition {
-    Nowhere,
-    Front,
-    Back,
-    Interlaced,
+  None,
+  Front,
+  Back,
+  Interlaced,
 };
 
 struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 4> {
-  static constexpr const char* Names[] = {
-      "Nowhere", "Front", "Back", "Interlaced"};
+  static constexpr const char* Names[] = {"None", "Front", "Back", "Interlaced"};
 };
 
 // functor that moves elements from an iterator range into a new Container instance
@@ -59,148 +58,149 @@ struct MoveInto {};
 
 template <typename T>
 struct MoveInto<std::vector<T>> {
-    template <class It>
-    [[nodiscard]] static std::vector<T> operator()(It first, It last) {
-        std::vector<T> out;
-        std::move(first, last, std::back_inserter(out));
-        return out;
-    }
+  template <class It>
+  [[nodiscard]] static std::vector<T> operator()(It first, It last) {
+    std::vector<T> out;
+    std::move(first, last, std::back_inserter(out));
+    return out;
+  }
 };
 
 template <typename T>
 struct MoveInto<std::forward_list<T>> {
-    template <class It>
-    [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
-        std::forward_list<T> out;
-        std::move(first, last, std::front_inserter(out));
-        out.reverse();
-        return out;
-    }
+  template <class It>
+  [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
+    std::forward_list<T> out;
+    std::move(first, last, std::front_inserter(out));
+    out.reverse();
+    return out;
+  }
 };
 
 template <typename T>
 struct MoveInto<std::set<T>> {
-    template <class It>
-    [[nodiscard]] static std::set<T> operator()(It first, It last) {
-        std::set<T> out;
-        std::move(first, last, std::inserter(out, out.begin()));
-        return out;
-    }
+  template <class It>
+  [[nodiscard]] static std::set<T> operator()(It first, It last) {
+    std::set<T> out;
+    std::move(first, last, std::inserter(out, out.begin()));
+    return out;
+  }
 };
 
 // lightweight wrapping around fillValues() which puts a little effort into
 // making values that would be contiguous when sorted non-contiguous in memory
 template <typename T>
 std::vector<T> getVectorOfRandom(size_t N) {
-  std::vector<T> V;
-  fillValues(V, N, Order::Random);
-  sortValues(V, Order::Random);
-  return std::vector<T>(V);
+  std::vector<T> v;
+  fillValues(v, N, Order::Random);
+  sortValues(v, Order::Random);
+  return std::vector<T>(v);
 }
 
 // forward_iterator wrapping which, for each increment, moves the underlying iterator forward Stride elements
 template <typename Wrapped>
 struct StridedFwdIt {
-  Wrapped Base;
-  unsigned Stride;
+  Wrapped base_;
+  unsigned stride_;
 
   using iterator_category = std::forward_iterator_tag;
-  using difference_type = typename Wrapped::difference_type;
-  using value_type = typename Wrapped::value_type;
-  using pointer = typename Wrapped::pointer;
-  using reference = typename Wrapped::reference;
-
-  StridedFwdIt(Wrapped B, unsigned Stride_) : Base(B), Stride(Stride_) { assert(Stride != 0); }
-
-  StridedFwdIt operator++() { for (unsigned I=0; I<Stride; ++I) ++Base; return *this; }
-  StridedFwdIt operator++(int) { auto Tmp = *this; ++*this; return Tmp; }
-  value_type& operator*() { return *Base; }
-  const value_type& operator*() const { return *Base; }
-  value_type& operator->() { return *Base; }
-  const value_type& operator->() const { return *Base; }
-  bool operator==(const StridedFwdIt& o) const { return Base==o.Base; }
+  using difference_type   = typename Wrapped::difference_type;
+  using value_type        = typename Wrapped::value_type;
+  using pointer           = typename Wrapped::pointer;
+  using reference         = typename Wrapped::reference;
+
+  StridedFwdIt(Wrapped base, unsigned stride) : base_(base), stride_(stride) { assert(stride_ != 0); }
+
+  StridedFwdIt operator++() {
+    for (unsigned i = 0; i < stride_; ++i)
+      ++base_;
+    return *this;
+  }
+  StridedFwdIt operator++(int) {
+    auto tmp = *this;
+    ++*this;
+    return tmp;
+  }
+  value_type& operator*() { return *base_; }
+  const value_type& operator*() const { return *base_; }
+  value_type& operator->() { return *base_; }
+  const value_type& operator->() const { return *base_; }
+  bool operator==(const StridedFwdIt& o) const { return base_ == o.base_; }
   bool operator!=(const StridedFwdIt& o) const { return !operator==(o); }
 };
-template <typename Wrapped> StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
-
+template <typename Wrapped>
+StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
 
 // realistically, data won't all be nicely contiguous in a container
 // we'll go through some effort to ensure that it's shuffled through memory
 template <class Container>
-std::pair<Container, Container> genCacheUnfriendlyData(size_t Size1, size_t Size2, OverlapPosition Pos) {
+std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
   using ValueType = typename Container::value_type;
-  const MoveInto<Container> moveInto;
-  const auto SrcSize = Pos == OverlapPosition::Nowhere ? Size1 + Size2 : std::max(Size1, Size2);
-  std::vector<ValueType> Src = getVectorOfRandom<ValueType>(SrcSize);
-
-  if (Pos == OverlapPosition::Nowhere) {
-    std::sort(Src.begin(), Src.end());
-    return std::make_pair(
-        moveInto(Src.begin(), Src.begin() + Size1),
-        moveInto(Src.begin() + Size1, Src.end()));
+  const MoveInto<Container> move_into;
+  const auto src_size = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
+  std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
+
+  if (pos == OverlapPosition::None) {
+    std::sort(src.begin(), src.end());
+    return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(src.begin() + size1, src.end()));
   }
 
   // all other overlap types will have to copy some part of the data, but if
   // we copy after sorting it will likely have high cache locality, so we sort
   // each copy separately
-  auto Copy = Src;
-  std::sort(Src.begin(), Src.end());
-  std::sort(Copy.begin(), Copy.end());
-
-  switch(Pos) {
-    case OverlapPosition::Nowhere:
-      break;
-
-    case OverlapPosition::Front:
-      return std::make_pair(
-          moveInto(Src.begin(), Src.begin() + Size1),
-          moveInto(Copy.begin(), Copy.begin() + Size2));
-
-    case OverlapPosition::Back:
-      return std::make_pair(
-          moveInto(Src.begin() + (Src.size() - Size1), Src.end()),
-          moveInto(Copy.begin() + (Copy.size() - Size2), Copy.end()));
-
-    case OverlapPosition::Interlaced:
-      const auto Stride1 = Size1 < Size2 ? Size2/Size1 : 1;
-      const auto Stride2 = Size2 < Size1 ? Size1/Size2 : 1;
-      return std::make_pair(
-          moveInto(StridedFwdIt(Src.begin(), Stride1), StridedFwdIt(Src.end(), Stride1)),
-          moveInto(StridedFwdIt(Copy.begin(), Stride2), StridedFwdIt(Copy.end(), Stride2)));
+  auto copy = src;
+  std::sort(src.begin(), src.end());
+  std::sort(copy.begin(), copy.end());
+
+  switch (pos) {
+  case OverlapPosition::None:
+    break;
+
+  case OverlapPosition::Front:
+    return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(copy.begin(), copy.begin() + size2));
+
+  case OverlapPosition::Back:
+    return std::make_pair(move_into(src.begin() + (src.size() - size1), src.end()),
+                          move_into(copy.begin() + (copy.size() - size2), copy.end()));
+
+  case OverlapPosition::Interlaced:
+    const auto stride1 = size1 < size2 ? size2 / size1 : 1;
+    const auto stride2 = size2 < size1 ? size1 / size2 : 1;
+    return std::make_pair(move_into(StridedFwdIt(src.begin(), stride1), StridedFwdIt(src.end(), stride1)),
+                          move_into(StridedFwdIt(copy.begin(), stride2), StridedFwdIt(copy.end(), stride2)));
   }
   abort();
   return std::pair<Container, Container>();
 }
 
-
 template <class ValueType, class Container, class Overlap>
 struct SetIntersection {
   using ContainerType = typename Container::template type<Value<ValueType>>;
-  size_t Size1;
-  size_t Size2;
+  size_t size1_;
+  size_t size2_;
 
-  SetIntersection(size_t M, size_t N) : Size1(M), Size2(N) {}
+  SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {}
 
   void run(benchmark::State& state) const {
     state.PauseTiming();
-    auto Input = genCacheUnfriendlyData<ContainerType>(Size1, Size2, Overlap());
-    std::vector<Value<ValueType>> out(std::min(Size1, Size2));
+    auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
+    std::vector<Value<ValueType>> out(std::min(size1_, size2_));
 
     size_t cmp;
-    auto trackingLess = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
-        ++cmp;
-        return std::less<Value<ValueType>>{}(lhs, rhs);
+    auto tracking_less = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
+      ++cmp;
+      return std::less<Value<ValueType>>{}(lhs, rhs);
     };
 
-    const auto BatchSize =  std::max(size_t{16}, (2*TestSetElements) / (Size1+Size2));
+    const auto BATCH_SIZE = std::max(size_t{16}, (2 * TestSetElements) / (size1_ + size2_));
     state.ResumeTiming();
 
     for (const auto& _ : state) {
-      while (state.KeepRunningBatch(BatchSize)) {
-        for (unsigned i=0; i<BatchSize; ++i) {
-          const auto& [C1, C2] = Input;
-          auto outIter = std::set_intersection(C1.begin(), C1.end(), C2.begin(), C2.end(), out.begin(), trackingLess);
-          benchmark::DoNotOptimize(outIter);
+      while (state.KeepRunningBatch(BATCH_SIZE)) {
+        for (unsigned i = 0; i < BATCH_SIZE; ++i) {
+          const auto& [c1, c2] = input;
+          auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin(), tracking_less);
+          benchmark::DoNotOptimize(res);
           state.counters["Comparisons"] = cmp;
         }
       }
@@ -208,17 +208,18 @@ struct SetIntersection {
   }
 
   std::string name() const {
-    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name +
-        ValueType::name() + '_' + std::to_string(Size1) + '_' + std::to_string(Size2);
+    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name + ValueType::name() + '_' +
+           std::to_string(size1_) + '_' + std::to_string(size2_);
   }
 };
 
 } // namespace
 
-int main(int argc, char** argv) {/**/
+int main(int argc, char** argv) { /**/
   benchmark::Initialize(&argc, argv);
   if (benchmark::ReportUnrecognizedArguments(argc, argv))
     return 1;
-  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(Quantities, Quantities);
+  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
+      Quantities, Quantities);
   benchmark::RunSpecifiedBenchmarks();
 }
diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 21117e6b7d7609..6ce9895f545a5d 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -91,7 +91,8 @@ struct _IterOps<_ClassicAlgPolicy> {
   // it's unclear whether _Iter has a difference_type and whether that's signed, so we play it safe:
   // use the incoming type for returning and steer clear of negative overflows
   template <class _Iter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
+  advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
     return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
   }
 
diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 3febcb411268fb..b1ecd1ae0d5698 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -79,12 +79,8 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
 }
 
 template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter __lower_bound(
-    _RandIter __first,
-    _Sent __last,
-    const _Type& __value,
-    _Comp& __comp,
-    _Proj& __proj) {
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
+__lower_bound(_RandIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
   const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
   return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
 }
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 46f6fbe4d3dd2b..a18bb6ff947b77 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -60,8 +60,8 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
         __comp_(__comp) {}
 
   _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-      _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-      operator()() && {
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+  operator()() && {
     while (__first2_ != __last2_) {
       __advance1_and_maybe_add_result();
       if (__first1_ == __last1_)
@@ -84,7 +84,7 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
   __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
     // use one-sided lower bound for improved algorithmic complexity bounds
     const auto __tmp = std::move(__iter);
-    __iter = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_);
+    __iter           = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_);
     __add_output_unless(__tmp != __iter);
   }
 
@@ -122,16 +122,16 @@ template <class _AlgPolicy,
           class _Sent2,
           class _OutIter>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
-    __set_intersection(
-        _InForwardIter1 __first1,
-        _Sent1 __last1,
-        _InForwardIter2 __first2,
-        _Sent2 __last2,
-        _OutIter __result,
-        _Compare&& __comp,
-        std::forward_iterator_tag,
-        std::forward_iterator_tag) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
+__set_intersection(
+    _InForwardIter1 __first1,
+    _Sent1 __last1,
+    _InForwardIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::forward_iterator_tag,
+    std::forward_iterator_tag) {
   std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
       __intersector(__first1, __last1, __first2, __last2, __result, __comp);
   return std::move(__intersector)();
@@ -146,16 +146,16 @@ template <class _AlgPolicy,
           class _Sent2,
           class _OutIter>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
-    __set_intersection(
-        _InInputIter1 __first1,
-        _Sent1 __last1,
-        _InInputIter2 __first2,
-        _Sent2 __last2,
-        _OutIter __result,
-        _Compare&& __comp,
-        std::input_iterator_tag,
-        std::input_iterator_tag) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+__set_intersection(
+    _InInputIter1 __first1,
+    _Sent1 __last1,
+    _InInputIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::input_iterator_tag,
+    std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -190,9 +190,9 @@ class __set_intersection_iter_category {
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-    __set_intersection(
-        _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+__set_intersection(
+    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
   return std::__set_intersection<_AlgPolicy>(
       std::move(__first1),
       std::move(__last1),

>From 450f5cebd41e425133fd221bf23b40bb20922eef Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 8 Jan 2024 21:51:27 +0000
Subject: [PATCH 10/14] General improvements to benchmark, including
 simplifying and slimming it down for faster runs, and including comparison
 counter.

---
 .../algorithms/set_intersection.bench.cpp     | 72 +++++++------------
 1 file changed, 27 insertions(+), 45 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 4fa411bba43549..baa5a7cdf05074 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <algorithm>
-#include <forward_list>
 #include <iterator>
 #include <set>
 #include <vector>
@@ -31,57 +30,26 @@ struct SetContainer {
   static constexpr const char* Name = "Set";
 };
 
-struct ForwardListContainer {
-  template <typename... Args>
-  using type = std::forward_list<Args...>;
-
-  static constexpr const char* Name = "ForwardList";
-};
-
-using AllContainerTypes = std::tuple<VectorContainer, SetContainer, ForwardListContainer>;
+using AllContainerTypes = std::tuple<VectorContainer, SetContainer>;
 
 // set_intersection performance may depend on where matching values lie
 enum class OverlapPosition {
   None,
   Front,
-  Back,
+  // performance-wise, matches at the back are identical to ones at the front
   Interlaced,
 };
 
-struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 4> {
-  static constexpr const char* Names[] = {"None", "Front", "Back", "Interlaced"};
+struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 3> {
+  static constexpr const char* Names[] = {"None", "Front", "Interlaced"};
 };
 
 // functor that moves elements from an iterator range into a new Container instance
 template <typename Container>
-struct MoveInto {};
-
-template <typename T>
-struct MoveInto<std::vector<T>> {
-  template <class It>
-  [[nodiscard]] static std::vector<T> operator()(It first, It last) {
-    std::vector<T> out;
-    std::move(first, last, std::back_inserter(out));
-    return out;
-  }
-};
-
-template <typename T>
-struct MoveInto<std::forward_list<T>> {
+struct MoveInto {
   template <class It>
-  [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
-    std::forward_list<T> out;
-    std::move(first, last, std::front_inserter(out));
-    out.reverse();
-    return out;
-  }
-};
-
-template <typename T>
-struct MoveInto<std::set<T>> {
-  template <class It>
-  [[nodiscard]] static std::set<T> operator()(It first, It last) {
-    std::set<T> out;
+  [[nodiscard]] static Container operator()(It first, It last) {
+    Container out;
     std::move(first, last, std::inserter(out, out.begin()));
     return out;
   }
@@ -137,7 +105,7 @@ template <class Container>
 std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
   using ValueType = typename Container::value_type;
   const MoveInto<Container> move_into;
-  const auto src_size = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
+  const auto src_size        = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
   std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
 
   if (pos == OverlapPosition::None) {
@@ -159,10 +127,6 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
   case OverlapPosition::Front:
     return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(copy.begin(), copy.begin() + size2));
 
-  case OverlapPosition::Back:
-    return std::make_pair(move_into(src.begin() + (src.size() - size1), src.end()),
-                          move_into(copy.begin() + (copy.size() - size2), copy.end()));
-
   case OverlapPosition::Interlaced:
     const auto stride1 = size1 < size2 ? size2 / size1 : 1;
     const auto stride2 = size2 < size1 ? size1 / size2 : 1;
@@ -181,6 +145,11 @@ struct SetIntersection {
 
   SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {}
 
+  bool skip() const noexcept {
+    // let's save some time and skip simmetrical runs
+    return size1_ <= size2_;
+  }
+
   void run(benchmark::State& state) const {
     state.PauseTiming();
     auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
@@ -192,12 +161,13 @@ struct SetIntersection {
       return std::less<Value<ValueType>>{}(lhs, rhs);
     };
 
-    const auto BATCH_SIZE = std::max(size_t{16}, (2 * TestSetElements) / (size1_ + size2_));
+    const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
     state.ResumeTiming();
 
     for (const auto& _ : state) {
       while (state.KeepRunningBatch(BATCH_SIZE)) {
         for (unsigned i = 0; i < BATCH_SIZE; ++i) {
+          cmp                  = 0;
           const auto& [c1, c2] = input;
           auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin(), tracking_less);
           benchmark::DoNotOptimize(res);
@@ -219,6 +189,18 @@ int main(int argc, char** argv) { /**/
   benchmark::Initialize(&argc, argv);
   if (benchmark::ReportUnrecognizedArguments(argc, argv))
     return 1;
+  const std::vector<size_t> Quantities = {
+      1 << 0,
+      1 << 4,
+      1 << 8,
+      1 << 14,
+// Running each benchmark in parallel consumes too much memory with MSAN
+// and can lead to the test process being killed.
+#if !TEST_HAS_FEATURE(memory_sanitizer)
+      1 << 18
+#endif
+  };
+
   makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
       Quantities, Quantities);
   benchmark::RunSpecifiedBenchmarks();

>From d0c5f2b8d23c76db2ba325aa0fb6172d1b6eb1da Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 15 Jan 2024 16:19:29 +0000
Subject: [PATCH 11/14] Huh, I wonder how I got `git clang-format` to miss
 those changes =/

---
 .../algorithms/set_intersection.bench.cpp     | 10 ++--
 libcxx/include/__algorithm/set_intersection.h | 58 +++++++++----------
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index baa5a7cdf05074..38010170508a88 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -190,14 +190,14 @@ int main(int argc, char** argv) { /**/
   if (benchmark::ReportUnrecognizedArguments(argc, argv))
     return 1;
   const std::vector<size_t> Quantities = {
-      1 << 0,
-      1 << 4,
-      1 << 8,
-      1 << 14,
+    1 << 0,
+    1 << 4,
+    1 << 8,
+    1 << 14,
 // Running each benchmark in parallel consumes too much memory with MSAN
 // and can lead to the test process being killed.
 #if !TEST_HAS_FEATURE(memory_sanitizer)
-      1 << 18
+    1 << 18
 #endif
   };
 
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index a18bb6ff947b77..504350d10779e4 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -59,9 +59,9 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
         __result_(__result),
         __comp_(__comp) {}
 
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-  _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-  operator()() && {
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+      __set_intersection_result<_InIter1, _InIter2, _OutIter>
+      operator()() && {
     while (__first2_ != __last2_) {
       __advance1_and_maybe_add_result();
       if (__first1_ == __last1_)
@@ -121,17 +121,17 @@ template <class _AlgPolicy,
           class _InForwardIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
-__set_intersection(
-    _InForwardIter1 __first1,
-    _Sent1 __last1,
-    _InForwardIter2 __first2,
-    _Sent2 __last2,
-    _OutIter __result,
-    _Compare&& __comp,
-    std::forward_iterator_tag,
-    std::forward_iterator_tag) {
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+    __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
+    __set_intersection(
+        _InForwardIter1 __first1,
+        _Sent1 __last1,
+        _InForwardIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::forward_iterator_tag,
+        std::forward_iterator_tag) {
   std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
       __intersector(__first1, __last1, __first2, __last2, __result, __comp);
   return std::move(__intersector)();
@@ -145,17 +145,17 @@ template <class _AlgPolicy,
           class _InInputIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
-__set_intersection(
-    _InInputIter1 __first1,
-    _Sent1 __last1,
-    _InInputIter2 __first2,
-    _Sent2 __last2,
-    _OutIter __result,
-    _Compare&& __comp,
-    std::input_iterator_tag,
-    std::input_iterator_tag) {
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+    __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+    __set_intersection(
+        _InInputIter1 __first1,
+        _Sent1 __last1,
+        _InInputIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::input_iterator_tag,
+        std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -189,10 +189,10 @@ class __set_intersection_iter_category {
 };
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-__set_intersection(
-    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+    __set_intersection_result<_InIter1, _InIter2, _OutIter>
+    __set_intersection(
+        _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
   return std::__set_intersection<_AlgPolicy>(
       std::move(__first1),
       std::move(__last1),

>From faa31150e13902941cfa0c9ef87bff265b12d898 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Jan 2024 16:25:56 +0000
Subject: [PATCH 12/14] Oops, bad mistake while porting into libc++!
 `__lower_bound_onesided()` must start with `__step==0`, otherwise we can't
 match the complexity of linear search when continually matching (like a
 std::set_intersection() of matching containers will).

---
 libcxx/include/__algorithm/lower_bound.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index b1ecd1ae0d5698..dc86e2fa5c81d1 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -63,6 +63,12 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
   // __iterator_category<_Iter>>::value,
   //       "lower_bound() is a multipass algorithm and requires forward iterator or better");
 
+  // split the step 0 scenario: this allows us to match worst-case complexity
+  // when replacing linear search
+  if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value))
+    return __first;
+  ++__first;
+
   using _Distance = typename iterator_traits<_Iter>::difference_type;
   for (_Distance __step = 1; __first != __last; __step <<= 1) {
     auto __it   = __first;

>From 995d04b872c8552633c36e38d382897e8329d1e2 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Jan 2024 22:31:41 +0000
Subject: [PATCH 13/14] Oops, bad tracking of displacement on
 `stride_counting_iterator`

---
 libcxx/test/support/test_iterators.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 3b86a93564e4b5..d1e077e1b26554 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -826,7 +826,7 @@ class stride_counting_iterator {
         It tmp(base_);
         base_ = base(tmp += n);
         ++*stride_count_;
-        ++*stride_displacement_;
+        *stride_displacement_ += n;
         return *this;
     }
 
@@ -836,7 +836,7 @@ class stride_counting_iterator {
         It tmp(base_);
         base_ = base(tmp -= n);
         ++*stride_count_;
-        --*stride_displacement_;
+        *stride_displacement_ -= n;
         return *this;
     }
 

>From d568d491cef941e2cb03d85bcce9b7d2ec7314c4 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Jan 2024 22:33:13 +0000
Subject: [PATCH 14/14] Add more counters to the set_intersection benchmark
 and guard them behind an environment variable, so we can choose to either
 measure time more accurately or obtain more information.

This led me down an interesting road of validating benchmark results, where I found a significant discrepancy in timings between running all test cases at once and running them individually with `--benchmark_filter`.
---
 .../algorithms/set_intersection.bench.cpp     | 38 +++++++++++++------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 38010170508a88..b2de0c3223b005 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -9,9 +9,11 @@
 #include <algorithm>
 #include <iterator>
 #include <set>
+#include <stdlib.h>
 #include <vector>
 
 #include "common.h"
+#include "test_iterators.h"
 
 namespace {
 
@@ -137,6 +139,10 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
   return std::pair<Container, Container>();
 }
 
+// use environment variable to enable additional counters: instrumentation will
+// impact CPU utilisation, let's give the user the option
+static const bool TRACK_COUNTERS = getenv("TRACK_COUNTERS") != nullptr;
+
 template <class ValueType, class Container, class Overlap>
 struct SetIntersection {
   using ContainerType = typename Container::template type<Value<ValueType>>;
@@ -147,7 +153,7 @@ struct SetIntersection {
 
   bool skip() const noexcept {
     // let's save some time and skip simmetrical runs
-    return size1_ <= size2_;
+    return size1_ < size2_;
   }
 
   void run(benchmark::State& state) const {
@@ -155,23 +161,33 @@ struct SetIntersection {
     auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
     std::vector<Value<ValueType>> out(std::min(size1_, size2_));
 
-    size_t cmp;
-    auto tracking_less = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
-      ++cmp;
-      return std::less<Value<ValueType>>{}(lhs, rhs);
-    };
-
     const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
     state.ResumeTiming();
 
     for (const auto& _ : state) {
       while (state.KeepRunningBatch(BATCH_SIZE)) {
         for (unsigned i = 0; i < BATCH_SIZE; ++i) {
-          cmp                  = 0;
           const auto& [c1, c2] = input;
-          auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin(), tracking_less);
-          benchmark::DoNotOptimize(res);
-          state.counters["Comparisons"] = cmp;
+          if (TRACK_COUNTERS) {
+            size_t cmp{}, strides{}, displacement{};
+            auto tracking_less = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
+              ++cmp;
+              return std::less<Value<ValueType>>{}(lhs, rhs);
+            };
+            stride_counting_iterator b1(c1.begin(), &strides, &displacement);
+            stride_counting_iterator e1(c1.end(), &strides, &displacement);
+            stride_counting_iterator b2(c2.begin(), &strides, &displacement);
+            stride_counting_iterator e2(c2.end(), &strides, &displacement);
+            auto res = std::set_intersection(b1, e1, b2, e2, out.begin(), tracking_less);
+            benchmark::DoNotOptimize(res);
+            state.counters["comparisons"]       = cmp;
+            state.counters["iter_strides"]      = strides;
+            state.counters["iter_displacement"] = displacement;
+
+          } else {
+            auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
+            benchmark::DoNotOptimize(res);
+          }
         }
       }
     }



More information about the cfe-commits mailing list