[libcxx-commits] [libcxx] [libc++] Speed up set_intersection() by fast-forwarding over ranges of non-matching elements with one-sided binary search. (PR #75230)

Iuri Chaer via libcxx-commits libcxx-commits at lists.llvm.org
Tue Apr 30 04:01:18 PDT 2024


https://github.com/ichaer updated https://github.com/llvm/llvm-project/pull/75230

>From b65415f5b70591eae965cae1316054145d399158 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:52:13 +0100
Subject: [PATCH 01/44] [libc++][test] Add lower_bound complexity validation
 tests prior to introducing one-sided binary search for non-random iterators.

---
 .../lower.bound/lower_bound.pass.cpp          | 19 +++++--
 .../lower.bound/lower_bound_comp.pass.cpp     | 28 ++++++++--
 libcxx/test/support/test_iterators.h          | 55 ++++++++++++++-----
 3 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
index a2d8ab632303cb..5c11962d137779 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
@@ -39,11 +39,20 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-    Iter i = std::lower_bound(first, last, value);
-    for (Iter j = first; j != i; ++j)
-        assert(*j < value);
-    for (Iter j = i; j != last; ++j)
-        assert(!(*j < value));
+  std::size_t strides{};
+  std::size_t displacement{};
+  stride_counting_iterator f(first, &strides, &displacement);
+  stride_counting_iterator l(last, &strides, &displacement);
+
+  auto i = std::lower_bound(f, l, value);
+  for (auto j = f; j != i; ++j)
+    assert(*j < value);
+  for (auto j = i; j != l; ++j)
+    assert(!(*j < value));
+
+  auto len = std::distance(first, last);
+  assert(strides <= 2.5 * len + 1);
+  assert(displacement <= 2.5 * len + 1);
 }
 
 template <class Iter>
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
index b9133028d9ade2..05fd43eada4616 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
@@ -17,6 +17,7 @@
 #include <vector>
 #include <cassert>
 #include <cstddef>
+#include <cmath>
 
 #include "test_macros.h"
 #include "test_iterators.h"
@@ -38,11 +39,28 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-    Iter i = std::lower_bound(first, last, value, std::greater<int>());
-    for (Iter j = first; j != i; ++j)
-        assert(std::greater<int>()(*j, value));
-    for (Iter j = i; j != last; ++j)
-        assert(!std::greater<int>()(*j, value));
+  std::size_t strides{};
+  std::size_t displacement{};
+  stride_counting_iterator f(first, &strides, &displacement);
+  stride_counting_iterator l(last, &strides, &displacement);
+
+  std::size_t comparisons{};
+  auto cmp = [&comparisons](int rhs, int lhs) {
+    ++comparisons;
+    return std::greater<int>()(rhs, lhs);
+  };
+
+  auto i = std::lower_bound(f, l, value, cmp);
+
+  for (auto j = f; j != i; ++j)
+    assert(std::greater<int>()(*j, value));
+  for (auto j = i; j != l; ++j)
+    assert(!std::greater<int>()(*j, value));
+
+  auto len = std::distance(first, last);
+  assert(strides <= 2.5 * len + 1);
+  assert(displacement <= 2.5 * len + 1);
+  assert(comparisons <= 2 * ceil(log(len + 1) + 2));
 }
 
 template <class Iter>
diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 1133b9597d09cf..3b86a93564e4b5 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -730,7 +730,9 @@ struct common_input_iterator {
 // * `stride_displacement`, which records the displacement of the calls. This means that both
 //   op++/op+= will increase the displacement counter by 1, and op--/op-= will decrease the
 //   displacement counter by 1.
-template <class It>
+template <class It,
+          class StrideCountType        = std::iter_difference_t<It>,
+          class StrideDisplacementType = std::iter_difference_t<It>>
 class stride_counting_iterator {
 public:
     using value_type = typename iter_value_or_void<It>::type;
@@ -743,16 +745,40 @@ class stride_counting_iterator {
         std::conditional_t<std::input_iterator<It>,         std::input_iterator_tag,
         /* else */                                          std::output_iterator_tag
     >>>>>;
+    using iterator_category = iterator_concept;
 
     stride_counting_iterator() requires std::default_initializable<It> = default;
 
     constexpr explicit stride_counting_iterator(It const& it) : base_(base(it)) { }
 
+    constexpr explicit stride_counting_iterator(
+        It const& it, StrideCountType* stride_count, StrideDisplacementType* stride_displacement)
+        : base_(base(it)), stride_count_(stride_count), stride_displacement_(stride_displacement) {}
+
+    constexpr stride_counting_iterator(const stride_counting_iterator& o) { *this = o; }
+    constexpr stride_counting_iterator(stride_counting_iterator&& o) { *this = o; }
+
+    constexpr stride_counting_iterator& operator=(const stride_counting_iterator& o) {
+      base_ = o.base_;
+      // if memory backing count is owned by the object, copy values
+      if (o.stride_count_ == &o.stride_count_default_) {
+        assert(o.stride_displacement_ == &o.stride_displacement_default_);
+        *stride_count_        = *o.stride_count_;
+        *stride_displacement_ = *o.stride_displacement_;
+        return *this;
+      }
+      // otherwise share the same externally-owned variables
+      stride_count_        = o.stride_count_;
+      stride_displacement_ = o.stride_displacement_;
+      return *this;
+    }
+    constexpr stride_counting_iterator& operator=(stride_counting_iterator&& o) { return *this = o; }
+
     friend constexpr It base(stride_counting_iterator const& it) { return It(it.base_); }
 
-    constexpr difference_type stride_count() const { return stride_count_; }
+    constexpr StrideCountType stride_count() const { return *stride_count_; }
 
-    constexpr difference_type stride_displacement() const { return stride_displacement_; }
+    constexpr StrideDisplacementType stride_displacement() const { return *stride_displacement_; }
 
     constexpr decltype(auto) operator*() const { return *It(base_); }
 
@@ -761,8 +787,8 @@ class stride_counting_iterator {
     constexpr stride_counting_iterator& operator++() {
         It tmp(base_);
         base_ = base(++tmp);
-        ++stride_count_;
-        ++stride_displacement_;
+        ++*stride_count_;
+        ++*stride_displacement_;
         return *this;
     }
 
@@ -781,8 +807,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(--tmp);
-        ++stride_count_;
-        --stride_displacement_;
+        ++*stride_count_;
+        --*stride_displacement_;
         return *this;
     }
 
@@ -799,8 +825,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(tmp += n);
-        ++stride_count_;
-        ++stride_displacement_;
+        ++*stride_count_;
+        ++*stride_displacement_;
         return *this;
     }
 
@@ -809,8 +835,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(tmp -= n);
-        ++stride_count_;
-        --stride_displacement_;
+        ++*stride_count_;
+        --*stride_displacement_;
         return *this;
     }
 
@@ -873,8 +899,11 @@ class stride_counting_iterator {
 
 private:
     decltype(base(std::declval<It>())) base_;
-    difference_type stride_count_ = 0;
-    difference_type stride_displacement_ = 0;
+    StrideCountType stride_count_default_               = 0;
+    StrideDisplacementType stride_displacement_default_ = 0;
+
+    StrideCountType* stride_count_               = &stride_count_default_;
+    StrideDisplacementType* stride_displacement_ = &stride_displacement_default_;
 };
 template <class It>
 stride_counting_iterator(It) -> stride_counting_iterator<It>;

>From f6bcf2743080ced55d9d589daed611c5e9696ac5 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:52:37 +0100
Subject: [PATCH 02/44] [libc++] Introduce one-sided binary search for
 lower_bound on non-random iterators.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
advantage of being Ω(1) rather than the classic algorithm's Ω(log(n)), with the downside of executing at most
2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing
the container's size upfront, which adds Ω(n) iterator increments to the complexity. The second one is when you're
traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
would yield Ω(n*log(n)) comparisons and, for non-random-access iterators, Ω(n^2) iterator increments, whereas the
one-sided version will yield O(n) operations on both counts, with an Ω(log(n)) bound on the number of comparisons.
---
 .../include/__algorithm/iterator_operations.h | 47 +++++++++++++
 libcxx/include/__algorithm/lower_bound.h      | 69 +++++++++++++++++--
 2 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index e6176da4f5606d..d73573747087e0 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -87,6 +87,53 @@ struct _IterOps<_ClassicAlgPolicy> {
     std::advance(__iter, __count);
   }
 
+  // advance with sentinel, a la std::ranges::advance
+  // it's unclear whether _Iter has a difference_type and whether that's signed, so we play it safe:
+  // use the incoming type for returning and steer clear of negative overflows
+  template <class _Iter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+    return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
+  template <class _InputIter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  __advance(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
+    _Distance __dist{};
+    for (; __dist < __count && __iter != __sentinel; ++__dist)
+      ++__iter;
+    return __count - __dist;
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
+  template <class _BiDirIter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  __advance(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
+    _Distance __dist{};
+    if (__count >= 0)
+      for (; __dist < __count && __iter != __sentinel; ++__dist)
+        ++__iter;
+    else
+      for (__count = -__count; __dist < __count && __iter != __sentinel; ++__dist)
+        --__iter;
+    return __count - __dist;
+  }
+
+  // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
+  template <class _RandIter, class _Distance>
+  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  __advance(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
+    auto __dist = _IterOps::distance(__iter, __sentinel);
+    _LIBCPP_ASSERT_UNCATEGORIZED(
+        __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count<0");
+    if (__count < 0)
+      __dist = __dist > __count ? __dist : __count;
+    else
+      __dist = __dist < __count ? __dist : __count;
+    __iter += __dist;
+    return __count - __dist;
+  }
+
   // distance
   template <class _Iter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 91c3bdaafd0cfd..b432829667fa99 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -27,11 +27,13 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
-_Iter __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
-  auto __len = _IterOps<_AlgPolicy>::distance(__first, __last);
-
+template <class _AlgPolicy, class _Iter, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting(
+    _Iter __first,
+    const _Type& __value,
+    typename iterator_traits<_Iter>::difference_type __len,
+    _Comp& __comp,
+    _Proj& __proj) {
   while (__len != 0) {
     auto __l2 = std::__half_positive(__len);
     _Iter __m = __first;
@@ -46,13 +48,68 @@ _Iter __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __
   return __first;
 }
 
+// One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
+// advantage of being Ω(1) rather than the classic algorithm's Ω(log(n)), with the downside of executing at most
+// 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
+// the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing
+// the container's size upfront, which adds Ω(n) iterator increments to the complexity. The second one is when you're
+// traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
+// would yield Ω(n*log(n)) comparisons and, for non-random-access iterators, Ω(n^2) iterator increments, whereas the
+// one-sided version will yield O(n) operations on both counts, with an Ω(log(n)) bound on the number of comparisons.
+template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+__lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+  // static_assert(std::is_base_of<std::forward_iterator_tag, typename _IterOps<_AlgPolicy>::template
+  // __iterator_category<_Iter>>::value,
+  //       "lower_bound() is a multipass algorithm and requires forward iterator or better");
+
+  using _Distance = typename iterator_traits<_Iter>::difference_type;
+  for (_Distance __step = 1; __first != __last; __step <<= 1) {
+    auto __it   = __first;
+    auto __dist = __step - _IterOps<_AlgPolicy>::advance(__it, __step, __last);
+    // once we reach the last range where needle can be we must start
+    // looking inwards, bisecting that range
+    if (__it == __last || !std::__invoke(__comp, std::__invoke(__proj, *__it), __value)) {
+      return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
+    }
+    // range not found, move forward!
+    __first = std::move(__it);
+  }
+  return __first;
+}
+
+template <class _AlgPolicy, class _InputIter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIter __lower_bound(
+    _InputIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj, std::input_iterator_tag) {
+  return std::__lower_bound_onesided<_AlgPolicy>(__first, __last, __value, __comp, __proj);
+}
+
+template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter __lower_bound(
+    _RandIter __first,
+    _Sent __last,
+    const _Type& __value,
+    _Comp& __comp,
+    _Proj& __proj,
+    std::random_access_iterator_tag) {
+  const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
+  return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
+}
+
+template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp&& __comp, _Proj&& __proj) {
+  return std::__lower_bound<_AlgPolicy>(
+      __first, __last, __value, __comp, __proj, typename _IterOps<_AlgPolicy>::template __iterator_category<_Iter>());
+}
+
 template <class _ForwardIterator, class _Tp, class _Compare>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
 _ForwardIterator lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {
   static_assert(__is_callable<_Compare, decltype(*__first), const _Tp&>::value,
                 "The comparator has to be callable");
   auto __proj = std::__identity();
-  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, __comp, __proj);
+  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, std::move(__comp), std::move(__proj));
 }
 
 template <class _ForwardIterator, class _Tp>

>From 36bb63e36b56f98da2b808ab55410bec5c1d0bb5 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:53:09 +0100
Subject: [PATCH 03/44] [libc++][test] Add set_intersection complexity
 validation tests prior to introducing use of one-sided binary search to
 fast-forward over ranges of elements.

---
 .../ranges_set_intersection.pass.cpp          | 240 +++++++++++++++++-
 1 file changed, 234 insertions(+), 6 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
index 0ee89e0131a073..30cedd19038d7b 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
@@ -28,6 +28,9 @@
 #include <algorithm>
 #include <array>
 #include <concepts>
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
 
 #include "almost_satisfies_types.h"
 #include "MoveOnly.h"
@@ -93,14 +96,17 @@ static_assert(!HasSetIntersectionRange<UncheckedRange<MoveOnly*>, UncheckedRange
 
 using std::ranges::set_intersection_result;
 
+// TODO: std::ranges::set_intersection calls std::ranges::copy
+// std::ranges::copy(contiguous_iterator<int*>, sentinel_wrapper<contiguous_iterator<int*>>, contiguous_iterator<int*>) doesn't seem to work.
+// It seems that std::ranges::copy calls std::copy, which unwraps contiguous_iterator<int*> into int*,
+// and then it failed because there is no == between int* and sentinel_wrapper<contiguous_iterator<int*>>
+template <typename Iter>
+using SentinelWorkaround = std::conditional_t<std::contiguous_iterator<Iter>, Iter, sentinel_wrapper<Iter>>;
+
 template <class In1, class In2, class Out, std::size_t N1, std::size_t N2, std::size_t N3>
 constexpr void testSetIntersectionImpl(std::array<int, N1> in1, std::array<int, N2> in2, std::array<int, N3> expected) {
-  // TODO: std::ranges::set_intersection calls std::ranges::copy
-  // std::ranges::copy(contiguous_iterator<int*>, sentinel_wrapper<contiguous_iterator<int*>>, contiguous_iterator<int*>) doesn't seem to work.
-  // It seems that std::ranges::copy calls std::copy, which unwraps contiguous_iterator<int*> into int*,
-  // and then it failed because there is no == between int* and sentinel_wrapper<contiguous_iterator<int*>>
-  using Sent1 = std::conditional_t<std::contiguous_iterator<In1>, In1, sentinel_wrapper<In1>>;
-  using Sent2 = std::conditional_t<std::contiguous_iterator<In2>, In2, sentinel_wrapper<In2>>;
+  using Sent1 = SentinelWorkaround<In1>;
+  using Sent2 = SentinelWorkaround<In2>;
 
   // iterator overload
   {
@@ -272,6 +278,225 @@ constexpr void runAllIteratorPermutationsTests() {
   static_assert(withAllPermutationsOfInIter1AndInIter2<contiguous_iterator<int*>>());
 }
 
+namespace {
+struct [[nodiscard]] OperationCounts {
+  std::size_t comparisons{};
+  struct PerInput {
+    std::size_t proj{};
+    std::size_t iterator_strides{};
+    std::ptrdiff_t iterator_displacement{};
+
+    // IGNORES proj!
+    [[nodiscard]] constexpr bool operator==(const PerInput& o) const {
+      return iterator_strides == o.iterator_strides && iterator_displacement == o.iterator_displacement;
+    }
+
+    [[nodiscard]] constexpr bool matchesExpectation(const PerInput& expect) {
+      return proj <= expect.proj && iterator_strides <= expect.iterator_strides &&
+             iterator_displacement <= expect.iterator_displacement;
+    }
+  };
+  std::array<PerInput, 2> in;
+
+  [[nodiscard]] constexpr bool matchesExpectation(const OperationCounts& expect) {
+    return comparisons <= expect.comparisons && in[0].matchesExpectation(expect.in[0]) &&
+           in[1].matchesExpectation(expect.in[1]);
+  }
+
+  [[nodiscard]] constexpr bool operator==(const OperationCounts& o) const {
+    return comparisons == o.comparisons && std::ranges::equal(in, o.in);
+  }
+};
+} // namespace
+
+#include <iostream>
+template <template <class...> class In1,
+          template <class...>
+          class In2,
+          class Out,
+          std::size_t N1,
+          std::size_t N2,
+          std::size_t N3>
+constexpr void testSetIntersectionAndReturnOpCounts(
+    std::array<int, N1> in1,
+    std::array<int, N2> in2,
+    std::array<int, N3> expected,
+    const OperationCounts& expectedOpCounts) {
+  OperationCounts ops;
+
+  const auto comp = [&ops](int x, int y) {
+    ++ops.comparisons;
+    return x < y;
+  };
+
+  std::array<int, N3> out;
+
+  stride_counting_iterator b1(
+      In1<decltype(in1.begin())>(in1.begin()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator e1(
+      In1<decltype(in1.end()) >(in1.end()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator b2(
+      In2<decltype(in2.begin())>(in2.begin()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+  stride_counting_iterator e2(
+      In2<decltype(in2.end()) >(in2.end()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+
+  std::set_intersection(b1, e1, b2, e2, Out(out.data()), comp);
+
+  assert(std::ranges::equal(out, expected));
+  assert(ops.matchesExpectation(expectedOpCounts));
+}
+
+template <template <class...> class In1,
+          template <class...>
+          class In2,
+          class Out,
+          std::size_t N1,
+          std::size_t N2,
+          std::size_t N3>
+constexpr void testRangesSetIntersectionAndReturnOpCounts(
+    std::array<int, N1> in1,
+    std::array<int, N2> in2,
+    std::array<int, N3> expected,
+    const OperationCounts& expectedOpCounts) {
+  OperationCounts ops;
+
+  const auto comp = [&ops](int x, int y) {
+    ++ops.comparisons;
+    return x < y;
+  };
+
+  const auto proj1 = [&ops](const int& i) {
+    ++ops.in[0].proj;
+    return i;
+  };
+
+  const auto proj2 = [&ops](const int& i) {
+    ++ops.in[1].proj;
+    return i;
+  };
+
+  std::array<int, N3> out;
+
+  stride_counting_iterator b1(
+      In1<decltype(in1.begin())>(in1.begin()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator e1(
+      In1<decltype(in1.end()) >(in1.end()), &ops.in[0].iterator_strides, &ops.in[0].iterator_displacement);
+  stride_counting_iterator b2(
+      In2<decltype(in2.begin())>(in2.begin()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+  stride_counting_iterator e2(
+      In2<decltype(in2.end()) >(in2.end()), &ops.in[1].iterator_strides, &ops.in[1].iterator_displacement);
+
+  std::ranges::subrange r1{b1, SentinelWorkaround<decltype(e1)>{e1}};
+  std::ranges::subrange r2{b2, SentinelWorkaround<decltype(e2)>{e2}};
+  std::same_as<set_intersection_result<decltype(e1), decltype(e2), Out>> decltype(auto) result =
+      std::ranges::set_intersection(r1, r2, Out{out.data()}, comp, proj1, proj2);
+  assert(std::ranges::equal(out, expected));
+  assert(base(result.in1) == base(e1));
+  assert(base(result.in2) == base(e2));
+  assert(base(result.out) == out.data() + out.size());
+  assert(ops.matchesExpectation(expectedOpCounts));
+}
+
+template <template <typename...> class In1, template <typename...> class In2, class Out>
+constexpr void testComplexityParameterizedIter() {
+  // Worst-case complexity:
+  // Let N=(last1 - first1) and M=(last2 - first2)
+  // At most 2*(N+M) - 1 comparisons and applications of each projection.
+  // At most 2*(N+M) iterator mutations.
+  {
+    std::array r1{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array r2{2, 4, 6, 8, 10, 12, 14, 16, 18, 20};
+    std::array<int, 0> expected{};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 37;
+    expectedCounts.in[0].proj                  = 37;
+    expectedCounts.in[0].iterator_strides      = 30;
+    expectedCounts.in[0].iterator_displacement = 30;
+    expectedCounts.in[1]                       = expectedCounts.in[0];
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+
+  {
+    std::array r1{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array r2{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+    std::array expected{1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 38;
+    expectedCounts.in[0].proj                  = 38;
+    expectedCounts.in[0].iterator_strides      = 30;
+    expectedCounts.in[0].iterator_displacement = 30;
+    expectedCounts.in[1]                       = expectedCounts.in[0];
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+
+  // Lower complexity when there is low overlap between ranges: we can make 2*log(X) comparisons when one range
+  // has X elements that can be skipped over.
+  {
+    std::array r1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    std::array r2{15};
+    std::array expected{15};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 8;
+    expectedCounts.in[0].proj                  = 8;
+    expectedCounts.in[0].iterator_strides      = 24;
+    expectedCounts.in[0].iterator_displacement = 24;
+    expectedCounts.in[1].proj                  = 8;
+    expectedCounts.in[1].iterator_strides      = 3;
+    expectedCounts.in[1].iterator_displacement = 3;
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+
+  {
+    std::array r1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    std::array r2{0, 16};
+    std::array<int, 0> expected{};
+
+    OperationCounts expectedCounts;
+    expectedCounts.comparisons                 = 10;
+    expectedCounts.in[0].proj                  = 10;
+    expectedCounts.in[0].iterator_strides      = 24;
+    expectedCounts.in[0].iterator_displacement = 24;
+    expectedCounts.in[1].proj                  = 10;
+    expectedCounts.in[1].iterator_strides      = 4;
+    expectedCounts.in[1].iterator_displacement = 4;
+
+    testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+    testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
+  }
+}
+
+template <template <typename...> class In2, class Out>
+constexpr void testComplexityParameterizedIterPermutateIn1() {
+  //common_input_iterator
+  testComplexityParameterizedIter<forward_iterator, In2, Out>();
+  testComplexityParameterizedIter<bidirectional_iterator, In2, Out>();
+  testComplexityParameterizedIter<random_access_iterator, In2, Out>();
+}
+
+template <class Out>
+constexpr void testComplexityParameterizedIterPermutateIn1In2() {
+  testComplexityParameterizedIterPermutateIn1<forward_iterator, Out>();
+  testComplexityParameterizedIterPermutateIn1<bidirectional_iterator, Out>();
+  testComplexityParameterizedIterPermutateIn1<random_access_iterator, Out>();
+}
+
+constexpr bool testComplexityMultipleTypes() {
+  //testComplexityParameterizedIter<cpp20_input_iterator, random_access_iterator, OutIter>();
+  testComplexityParameterizedIterPermutateIn1In2<forward_iterator<int*>>();
+  testComplexityParameterizedIterPermutateIn1In2<bidirectional_iterator<int*>>();
+  testComplexityParameterizedIterPermutateIn1In2<random_access_iterator<int*>>();
+  return true;
+}
+
 constexpr bool test() {
   // check that every element is copied exactly once
   {
@@ -572,5 +797,8 @@ int main(int, char**) {
   // than the step limit.
   runAllIteratorPermutationsTests();
 
+  testComplexityMultipleTypes();
+  static_assert(testComplexityMultipleTypes());
+
   return 0;
 }

>From c23272c389329d3af83c0f58f896ee6ea47260ed Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 17 Oct 2023 13:53:31 +0100
Subject: [PATCH 04/44] [libc++] Introduce use of __lower_bound_onesided to
 improve average complexity of set_intersection.

---
 libcxx/include/__algorithm/set_intersection.h | 154 +++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index f2603fe1365ac3..556738022f4859 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -12,9 +12,13 @@
 #include <__algorithm/comp.h>
 #include <__algorithm/comp_ref_type.h>
 #include <__algorithm/iterator_operations.h>
+#include <__algorithm/lower_bound.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/next.h>
+#include <__type_traits/is_same.h>
+#include <__utility/exchange.h>
 #include <__utility/move.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -36,9 +40,122 @@ struct __set_intersection_result {
 };
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-__set_intersection(
-    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+struct _LIBCPP_NODISCARD_EXT __set_intersector {
+  _InIter1& __first1_;
+  const _Sent1& __last1_;
+  _InIter2& __first2_;
+  const _Sent2& __last2_;
+  _OutIter& __result_;
+  _Compare& __comp_;
+  static constexpr auto __proj_ = std::__identity();
+  bool __prev_advanced_         = true;
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersector(
+      _InIter1& __first1, _Sent1& __last1, _InIter2& __first2, _Sent2& __last2, _OutIter& __result, _Compare& __comp)
+      : __first1_(__first1),
+        __last1_(__last1),
+        __first2_(__first2),
+        __last2_(__last2),
+        __result_(__result),
+        __comp_(__comp) {}
+
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+      _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+      operator()() && {
+    while (__first2_ != __last2_) {
+      __advance1_and_maybe_add_result();
+      if (__first1_ == __last1_)
+        break;
+      __advance2_and_maybe_add_result();
+    }
+    return __set_intersection_result<_InIter1, _InIter2, _OutIter>(
+        _IterOps<_AlgPolicy>::next(std::move(__first1_), std::move(__last1_)),
+        _IterOps<_AlgPolicy>::next(std::move(__first2_), std::move(__last2_)),
+        std::move(__result_));
+  }
+
+private:
+  // advance __iter to the first element in the range where !__comp_(__iter, __value)
+  // add result if this is the second consecutive call without advancing
+  // this method only works if you alternate calls between __advance1_and_maybe_add_result() and
+  // __advance2_and_maybe_add_result()
+  template <class _Iter, class _Sent, class _Value>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
+  __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
+    // use one-sided lower bound for improved algorithmic complexity bounds
+    const auto __tmp =
+        std::exchange(__iter, std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_));
+    __add_output_unless(__tmp != __iter);
+  }
+
+  // advance __first1_ to the first element in the range where !__comp_(*__first1_, *__first2_)
+  // add result if neither __first1_ nor __first2_ advanced in the last attempt (meaning they are equal)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __advance1_and_maybe_add_result() {
+    __advance_and_maybe_add_result(__first1_, __last1_, *__first2_);
+  }
+
+  // advance __first2_ to the first element in the range where !__comp_(*__first2_, *__first1_)
+  // add result if neither __first1_ nor __first2_ advanced in the last attempt (meaning they are equal)
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __advance2_and_maybe_add_result() {
+    __advance_and_maybe_add_result(__first2_, __last2_, *__first1_);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __add_output_unless(bool __advanced) {
+    if (__advanced | __prev_advanced_) {
+      __prev_advanced_ = __advanced;
+    } else {
+      *__result_ = *__first1_;
+      ++__result_;
+      ++__first1_;
+      ++__first2_;
+      __prev_advanced_ = true;
+    }
+  }
+};
+
+// with forward iterators we can use binary search to skip over entries
+template <class _AlgPolicy,
+          class _Compare,
+          class _InForwardIter1,
+          class _Sent1,
+          class _InForwardIter2,
+          class _Sent2,
+          class _OutIter>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
+    __set_intersection(
+        _InForwardIter1 __first1,
+        _Sent1 __last1,
+        _InForwardIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::forward_iterator_tag,
+        std::forward_iterator_tag) {
+  std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
+      __intersector(__first1, __last1, __first2, __last2, __result, __comp);
+  return std::move(__intersector)();
+}
+
+// input iterators are not suitable for multipass algorithms, so we stick to the classic single-pass version
+template <class _AlgPolicy,
+          class _Compare,
+          class _InInputIter1,
+          class _Sent1,
+          class _InInputIter2,
+          class _Sent2,
+          class _OutIter>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+    __set_intersection(
+        _InInputIter1 __first1,
+        _Sent1 __last1,
+        _InInputIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::input_iterator_tag,
+        std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -52,12 +169,41 @@ __set_intersection(
     }
   }
 
-  return __set_intersection_result<_InIter1, _InIter2, _OutIter>(
+  return std::__set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>(
       _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)),
       _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)),
       std::move(__result));
 }
 
+template <class _AlgPolicy, class _Iter>
+class __set_intersection_iter_category {
+  template <class _It>
+  using __cat = typename std::_IterOps<_AlgPolicy>::template __iterator_category<_It>;
+  template <class _It>
+  static auto test(__cat<_It>*) -> __cat<_It>;
+  template <class>
+  static std::input_iterator_tag test(...);
+
+public:
+  using __type = decltype(test<_Iter>(nullptr));
+};
+
+template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
+    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+    __set_intersection(
+        _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+  return std::__set_intersection<_AlgPolicy>(
+      std::move(__first1),
+      std::move(__last1),
+      std::move(__first2),
+      std::move(__last2),
+      std::move(__result),
+      std::forward<_Compare>(__comp),
+      typename std::__set_intersection_iter_category<_AlgPolicy, _InIter1>::__type(),
+      typename std::__set_intersection_iter_category<_AlgPolicy, _InIter2>::__type());
+}
+
 template <class _InputIterator1, class _InputIterator2, class _OutputIterator, class _Compare>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_intersection(
     _InputIterator1 __first1,

>From 0b57ea00b44dbe69bc5125a08691a72b0dea42ce Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:18:21 +0000
Subject: [PATCH 05/44] Fix `constexpr` annotations.

---
 libcxx/include/__algorithm/iterator_operations.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index d73573747087e0..21117e6b7d7609 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -91,13 +91,13 @@ struct _IterOps<_ClassicAlgPolicy> {
   // it's unclear whether _Iter has a difference_type and whether that's signed, so we play it safe:
   // use the incoming type for returning and steer clear of negative overflows
   template <class _Iter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
     return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
   }
 
   // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
   template <class _InputIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
     _Distance __dist{};
     for (; __dist < __count && __iter != __sentinel; ++__dist)
@@ -107,7 +107,7 @@ struct _IterOps<_ClassicAlgPolicy> {
 
   // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
   template <class _BiDirIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
     _Distance __dist{};
     if (__count >= 0)

>From 08af54897cd8e39a25a1e97b0174b68beb408cd0 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:18:59 +0000
Subject: [PATCH 06/44] Remove std::exchange dependency from
 std::set_intersection so it works before C++14

---
 libcxx/include/__algorithm/set_intersection.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 556738022f4859..46f6fbe4d3dd2b 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -83,8 +83,8 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
   __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
     // use one-sided lower bound for improved algorithmic complexity bounds
-    const auto __tmp =
-        std::exchange(__iter, std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_));
+    const auto __tmp = std::move(__iter);
+    __iter = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_);
     __add_output_unless(__tmp != __iter);
   }
 

>From 7aa3927064083b6a96bfcc4e00d1b4fc24d9c96e Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:20:06 +0000
Subject: [PATCH 07/44] Review feedback: don't use one-sided lower bound in
 lower_bound() itself since that violates the complexity guarantees from the
 standard.

---
 libcxx/include/__algorithm/lower_bound.h       | 18 ++----------------
 .../lower.bound/lower_bound.pass.cpp           | 10 +++++-----
 .../lower.bound/lower_bound_comp.pass.cpp      | 13 ++++++-------
 3 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index b432829667fa99..3febcb411268fb 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -78,38 +78,24 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
   return __first;
 }
 
-template <class _AlgPolicy, class _InputIter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIter __lower_bound(
-    _InputIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj, std::input_iterator_tag) {
-  return std::__lower_bound_onesided<_AlgPolicy>(__first, __last, __value, __comp, __proj);
-}
-
 template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter __lower_bound(
     _RandIter __first,
     _Sent __last,
     const _Type& __value,
     _Comp& __comp,
-    _Proj& __proj,
-    std::random_access_iterator_tag) {
+    _Proj& __proj) {
   const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
   return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
 }
 
-template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
-__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp&& __comp, _Proj&& __proj) {
-  return std::__lower_bound<_AlgPolicy>(
-      __first, __last, __value, __comp, __proj, typename _IterOps<_AlgPolicy>::template __iterator_category<_Iter>());
-}
-
 template <class _ForwardIterator, class _Tp, class _Compare>
 _LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
 _ForwardIterator lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) {
   static_assert(__is_callable<_Compare, decltype(*__first), const _Tp&>::value,
                 "The comparator has to be callable");
   auto __proj = std::__identity();
-  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, std::move(__comp), std::move(__proj));
+  return std::__lower_bound<_ClassicAlgPolicy>(__first, __last, __value, __comp, __proj);
 }
 
 template <class _ForwardIterator, class _Tp>
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
index 5c11962d137779..dd2916338e8f6e 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
@@ -45,14 +45,14 @@ test(Iter first, Iter last, const T& value)
   stride_counting_iterator l(last, &strides, &displacement);
 
   auto i = std::lower_bound(f, l, value);
-  for (auto j = f; j != i; ++j)
+  for (auto j = base(f); j != base(i); ++j)
     assert(*j < value);
-  for (auto j = i; j != l; ++j)
+  for (auto j = base(i); j != base(l); ++j)
     assert(!(*j < value));
 
-  auto len = std::distance(first, last);
-  assert(strides <= 2.5 * len + 1);
-  assert(displacement <= 2.5 * len + 1);
+  auto len = static_cast<std::size_t>(std::distance(first, last));
+  assert(strides <= 2 * len);
+  assert(displacement <= 2 * len);
 }
 
 template <class Iter>
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
index 05fd43eada4616..ff928e23b9006a 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
@@ -51,16 +51,15 @@ test(Iter first, Iter last, const T& value)
   };
 
   auto i = std::lower_bound(f, l, value, cmp);
-
-  for (auto j = f; j != i; ++j)
+  for (auto j = base(f); j != base(i); ++j)
     assert(std::greater<int>()(*j, value));
-  for (auto j = i; j != l; ++j)
+  for (auto j = base(i); j != base(l); ++j)
     assert(!std::greater<int>()(*j, value));
 
-  auto len = std::distance(first, last);
-  assert(strides <= 2.5 * len + 1);
-  assert(displacement <= 2.5 * len + 1);
-  assert(comparisons <= 2 * ceil(log(len + 1) + 2));
+  auto len = static_cast<std::size_t>(std::distance(first, last));
+  assert(strides <= 2 * len);
+  assert(displacement <= 2 * len);
+  assert(comparisons <= std::ceil(std::log2(len + 1)));
 }
 
 template <class Iter>

>From c44c2a2b8ea818287b859c5ce318d195c59e9d65 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 2 Jan 2024 17:21:33 +0000
Subject: [PATCH 08/44] Create new benchmark for set_intersection().

---
 libcxx/benchmarks/CMakeLists.txt              |   1 +
 .../algorithms/set_intersection.bench.cpp     | 224 ++++++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 libcxx/benchmarks/algorithms/set_intersection.bench.cpp

diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 7591f34d938bf8..da2ea6fd4c3d1c 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -192,6 +192,7 @@ set(BENCHMARK_TESTS
     algorithms/ranges_sort.bench.cpp
     algorithms/ranges_sort_heap.bench.cpp
     algorithms/ranges_stable_sort.bench.cpp
+    algorithms/set_intersection.bench.cpp
     algorithms/sort.bench.cpp
     algorithms/sort_heap.bench.cpp
     algorithms/stable_sort.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
new file mode 100644
index 00000000000000..c6a01707d65311
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -0,0 +1,224 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <forward_list>
+#include <iterator>
+#include <set>
+#include <vector>
+
+#include "common.h"
+
+namespace {
+
+// types of containers we'll want to test, covering interesting iterator types
+struct VectorContainer {
+  template <typename... Args>
+  using type = std::vector<Args...>;
+
+  static constexpr const char* Name = "Vector";
+};
+
+struct SetContainer {
+  template <typename... Args>
+  using type = std::set<Args...>;
+
+  static constexpr const char* Name = "Set";
+};
+
+struct ForwardListContainer {
+  template <typename... Args>
+  using type = std::forward_list<Args...>;
+
+  static constexpr const char* Name = "ForwardList";
+};
+
+using AllContainerTypes = std::tuple<VectorContainer, SetContainer, ForwardListContainer>;
+
+// set_intersection performance may depend on where matching values lie
+enum class OverlapPosition {
+    Nowhere,
+    Front,
+    Back,
+    Interlaced,
+};
+
+struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 4> {
+  static constexpr const char* Names[] = {
+      "Nowhere", "Front", "Back", "Interlaced"};
+};
+
+// functor that moves elements from an iterator range into a new Container instance
+template <typename Container>
+struct MoveInto {};
+
+template <typename T>
+struct MoveInto<std::vector<T>> {
+    template <class It>
+    [[nodiscard]] static std::vector<T> operator()(It first, It last) {
+        std::vector<T> out;
+        std::move(first, last, std::back_inserter(out));
+        return out;
+    }
+};
+
+template <typename T>
+struct MoveInto<std::forward_list<T>> {
+    template <class It>
+    [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
+        std::forward_list<T> out;
+        std::move(first, last, std::front_inserter(out));
+        out.reverse();
+        return out;
+    }
+};
+
+template <typename T>
+struct MoveInto<std::set<T>> {
+    template <class It>
+    [[nodiscard]] static std::set<T> operator()(It first, It last) {
+        std::set<T> out;
+        std::move(first, last, std::inserter(out, out.begin()));
+        return out;
+    }
+};
+
+// lightweight wrapping around fillValues() which puts a little effort into
+// making that would be contiguous when sorted non-contiguous in memory
+template <typename T>
+std::vector<T> getVectorOfRandom(size_t N) {
+  std::vector<T> V;
+  fillValues(V, N, Order::Random);
+  sortValues(V, Order::Random);
+  return std::vector<T>(V);
+}
+
+// forward_iterator wrapping which, for each increment, moves the underlying iterator forward Stride elements
+template <typename Wrapped>
+struct StridedFwdIt {
+  Wrapped Base;
+  unsigned Stride;
+
+  using iterator_category = std::forward_iterator_tag;
+  using difference_type = typename Wrapped::difference_type;
+  using value_type = typename Wrapped::value_type;
+  using pointer = typename Wrapped::pointer;
+  using reference = typename Wrapped::reference;
+
+  StridedFwdIt(Wrapped B, unsigned Stride_) : Base(B), Stride(Stride_) { assert(Stride != 0); }
+
+  StridedFwdIt operator++() { for (unsigned I=0; I<Stride; ++I) ++Base; return *this; }
+  StridedFwdIt operator++(int) { auto Tmp = *this; ++*this; return Tmp; }
+  value_type& operator*() { return *Base; }
+  const value_type& operator*() const { return *Base; }
+  value_type& operator->() { return *Base; }
+  const value_type& operator->() const { return *Base; }
+  bool operator==(const StridedFwdIt& o) const { return Base==o.Base; }
+  bool operator!=(const StridedFwdIt& o) const { return !operator==(o); }
+};
+template <typename Wrapped> StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
+
+
+// realistically, data won't all be nicely contiguous in a container
+// we'll go through some effort to ensure that it's shuffled through memory
+template <class Container>
+std::pair<Container, Container> genCacheUnfriendlyData(size_t Size1, size_t Size2, OverlapPosition Pos) {
+  using ValueType = typename Container::value_type;
+  const MoveInto<Container> moveInto;
+  const auto SrcSize = Pos == OverlapPosition::Nowhere ? Size1 + Size2 : std::max(Size1, Size2);
+  std::vector<ValueType> Src = getVectorOfRandom<ValueType>(SrcSize);
+
+  if (Pos == OverlapPosition::Nowhere) {
+    std::sort(Src.begin(), Src.end());
+    return std::make_pair(
+        moveInto(Src.begin(), Src.begin() + Size1),
+        moveInto(Src.begin() + Size1, Src.end()));
+  }
+
+  // all other overlap types will have to copy some part of the data, but if
+  // we copy after sorting it will likely have high cache locality, so we sort
+  // each copy separately
+  auto Copy = Src;
+  std::sort(Src.begin(), Src.end());
+  std::sort(Copy.begin(), Copy.end());
+
+  switch(Pos) {
+    case OverlapPosition::Nowhere:
+      break;
+
+    case OverlapPosition::Front:
+      return std::make_pair(
+          moveInto(Src.begin(), Src.begin() + Size1),
+          moveInto(Copy.begin(), Copy.begin() + Size2));
+
+    case OverlapPosition::Back:
+      return std::make_pair(
+          moveInto(Src.begin() + (Src.size() - Size1), Src.end()),
+          moveInto(Copy.begin() + (Copy.size() - Size2), Copy.end()));
+
+    case OverlapPosition::Interlaced:
+      const auto Stride1 = Size1 < Size2 ? Size2/Size1 : 1;
+      const auto Stride2 = Size2 < Size1 ? Size1/Size2 : 1;
+      return std::make_pair(
+          moveInto(StridedFwdIt(Src.begin(), Stride1), StridedFwdIt(Src.end(), Stride1)),
+          moveInto(StridedFwdIt(Copy.begin(), Stride2), StridedFwdIt(Copy.end(), Stride2)));
+  }
+  abort();
+  return std::pair<Container, Container>();
+}
+
+
+template <class ValueType, class Container, class Overlap>
+struct SetIntersection {
+  using ContainerType = typename Container::template type<Value<ValueType>>;
+  size_t Size1;
+  size_t Size2;
+
+  SetIntersection(size_t M, size_t N) : Size1(M), Size2(N) {}
+
+  void run(benchmark::State& state) const {
+    state.PauseTiming();
+    auto Input = genCacheUnfriendlyData<ContainerType>(Size1, Size2, Overlap());
+    std::vector<Value<ValueType>> out(std::min(Size1, Size2));
+
+    size_t cmp;
+    auto trackingLess = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
+        ++cmp;
+        return std::less<Value<ValueType>>{}(lhs, rhs);
+    };
+
+    const auto BatchSize =  std::max(size_t{16}, (2*TestSetElements) / (Size1+Size2));
+    state.ResumeTiming();
+
+    for (const auto& _ : state) {
+      while (state.KeepRunningBatch(BatchSize)) {
+        for (unsigned i=0; i<BatchSize; ++i) {
+          const auto& [C1, C2] = Input;
+          auto outIter = std::set_intersection(C1.begin(), C1.end(), C2.begin(), C2.end(), out.begin(), trackingLess);
+          benchmark::DoNotOptimize(outIter);
+          state.counters["Comparisons"] = cmp;
+        }
+      }
+    }
+  }
+
+  std::string name() const {
+    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name +
+        ValueType::name() + '_' + std::to_string(Size1) + '_' + std::to_string(Size2);
+  }
+};
+
+} // namespace
+
+int main(int argc, char** argv) {/**/
+  benchmark::Initialize(&argc, argv);
+  if (benchmark::ReportUnrecognizedArguments(argc, argv))
+    return 1;
+  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(Quantities, Quantities);
+  benchmark::RunSpecifiedBenchmarks();
+}

>From 46cc95f71742e32d8131a5b08fa271b122a919c3 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Fri, 5 Jan 2024 23:04:19 +0000
Subject: [PATCH 09/44] Formatting fixups.

---
 .../algorithms/set_intersection.bench.cpp     | 201 +++++++++---------
 .../include/__algorithm/iterator_operations.h |   3 +-
 libcxx/include/__algorithm/lower_bound.h      |   8 +-
 libcxx/include/__algorithm/set_intersection.h |  52 ++---
 4 files changed, 131 insertions(+), 133 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index c6a01707d65311..4fa411bba43549 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -42,15 +42,14 @@ using AllContainerTypes = std::tuple<VectorContainer, SetContainer, ForwardListC
 
 // set_intersection performance may depend on where matching values lie
 enum class OverlapPosition {
-    Nowhere,
-    Front,
-    Back,
-    Interlaced,
+  None,
+  Front,
+  Back,
+  Interlaced,
 };
 
 struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 4> {
-  static constexpr const char* Names[] = {
-      "Nowhere", "Front", "Back", "Interlaced"};
+  static constexpr const char* Names[] = {"None", "Front", "Back", "Interlaced"};
 };
 
 // functor that moves elements from an iterator range into a new Container instance
@@ -59,148 +58,149 @@ struct MoveInto {};
 
 template <typename T>
 struct MoveInto<std::vector<T>> {
-    template <class It>
-    [[nodiscard]] static std::vector<T> operator()(It first, It last) {
-        std::vector<T> out;
-        std::move(first, last, std::back_inserter(out));
-        return out;
-    }
+  template <class It>
+  [[nodiscard]] static std::vector<T> operator()(It first, It last) {
+    std::vector<T> out;
+    std::move(first, last, std::back_inserter(out));
+    return out;
+  }
 };
 
 template <typename T>
 struct MoveInto<std::forward_list<T>> {
-    template <class It>
-    [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
-        std::forward_list<T> out;
-        std::move(first, last, std::front_inserter(out));
-        out.reverse();
-        return out;
-    }
+  template <class It>
+  [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
+    std::forward_list<T> out;
+    std::move(first, last, std::front_inserter(out));
+    out.reverse();
+    return out;
+  }
 };
 
 template <typename T>
 struct MoveInto<std::set<T>> {
-    template <class It>
-    [[nodiscard]] static std::set<T> operator()(It first, It last) {
-        std::set<T> out;
-        std::move(first, last, std::inserter(out, out.begin()));
-        return out;
-    }
+  template <class It>
+  [[nodiscard]] static std::set<T> operator()(It first, It last) {
+    std::set<T> out;
+    std::move(first, last, std::inserter(out, out.begin()));
+    return out;
+  }
 };
 
 // lightweight wrapping around fillValues() which puts a little effort into
 // making that would be contiguous when sorted non-contiguous in memory
 template <typename T>
 std::vector<T> getVectorOfRandom(size_t N) {
-  std::vector<T> V;
-  fillValues(V, N, Order::Random);
-  sortValues(V, Order::Random);
-  return std::vector<T>(V);
+  std::vector<T> v;
+  fillValues(v, N, Order::Random);
+  sortValues(v, Order::Random);
+  return std::vector<T>(v);
 }
 
 // forward_iterator wrapping which, for each increment, moves the underlying iterator forward Stride elements
 template <typename Wrapped>
 struct StridedFwdIt {
-  Wrapped Base;
-  unsigned Stride;
+  Wrapped base_;
+  unsigned stride_;
 
   using iterator_category = std::forward_iterator_tag;
-  using difference_type = typename Wrapped::difference_type;
-  using value_type = typename Wrapped::value_type;
-  using pointer = typename Wrapped::pointer;
-  using reference = typename Wrapped::reference;
-
-  StridedFwdIt(Wrapped B, unsigned Stride_) : Base(B), Stride(Stride_) { assert(Stride != 0); }
-
-  StridedFwdIt operator++() { for (unsigned I=0; I<Stride; ++I) ++Base; return *this; }
-  StridedFwdIt operator++(int) { auto Tmp = *this; ++*this; return Tmp; }
-  value_type& operator*() { return *Base; }
-  const value_type& operator*() const { return *Base; }
-  value_type& operator->() { return *Base; }
-  const value_type& operator->() const { return *Base; }
-  bool operator==(const StridedFwdIt& o) const { return Base==o.Base; }
+  using difference_type   = typename Wrapped::difference_type;
+  using value_type        = typename Wrapped::value_type;
+  using pointer           = typename Wrapped::pointer;
+  using reference         = typename Wrapped::reference;
+
+  StridedFwdIt(Wrapped base, unsigned stride) : base_(base), stride_(stride) { assert(stride_ != 0); }
+
+  StridedFwdIt operator++() {
+    for (unsigned i = 0; i < stride_; ++i)
+      ++base_;
+    return *this;
+  }
+  StridedFwdIt operator++(int) {
+    auto tmp = *this;
+    ++*this;
+    return tmp;
+  }
+  value_type& operator*() { return *base_; }
+  const value_type& operator*() const { return *base_; }
+  value_type& operator->() { return *base_; }
+  const value_type& operator->() const { return *base_; }
+  bool operator==(const StridedFwdIt& o) const { return base_ == o.base_; }
   bool operator!=(const StridedFwdIt& o) const { return !operator==(o); }
 };
-template <typename Wrapped> StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
-
+template <typename Wrapped>
+StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
 
 // realistically, data won't all be nicely contiguous in a container
 // we'll go through some effort to ensure that it's shuffled through memory
 template <class Container>
-std::pair<Container, Container> genCacheUnfriendlyData(size_t Size1, size_t Size2, OverlapPosition Pos) {
+std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
   using ValueType = typename Container::value_type;
-  const MoveInto<Container> moveInto;
-  const auto SrcSize = Pos == OverlapPosition::Nowhere ? Size1 + Size2 : std::max(Size1, Size2);
-  std::vector<ValueType> Src = getVectorOfRandom<ValueType>(SrcSize);
-
-  if (Pos == OverlapPosition::Nowhere) {
-    std::sort(Src.begin(), Src.end());
-    return std::make_pair(
-        moveInto(Src.begin(), Src.begin() + Size1),
-        moveInto(Src.begin() + Size1, Src.end()));
+  const MoveInto<Container> move_into;
+  const auto src_size = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
+  std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
+
+  if (pos == OverlapPosition::None) {
+    std::sort(src.begin(), src.end());
+    return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(src.begin() + size1, src.end()));
   }
 
   // all other overlap types will have to copy some part of the data, but if
   // we copy after sorting it will likely have high cache locality, so we sort
   // each copy separately
-  auto Copy = Src;
-  std::sort(Src.begin(), Src.end());
-  std::sort(Copy.begin(), Copy.end());
-
-  switch(Pos) {
-    case OverlapPosition::Nowhere:
-      break;
-
-    case OverlapPosition::Front:
-      return std::make_pair(
-          moveInto(Src.begin(), Src.begin() + Size1),
-          moveInto(Copy.begin(), Copy.begin() + Size2));
-
-    case OverlapPosition::Back:
-      return std::make_pair(
-          moveInto(Src.begin() + (Src.size() - Size1), Src.end()),
-          moveInto(Copy.begin() + (Copy.size() - Size2), Copy.end()));
-
-    case OverlapPosition::Interlaced:
-      const auto Stride1 = Size1 < Size2 ? Size2/Size1 : 1;
-      const auto Stride2 = Size2 < Size1 ? Size1/Size2 : 1;
-      return std::make_pair(
-          moveInto(StridedFwdIt(Src.begin(), Stride1), StridedFwdIt(Src.end(), Stride1)),
-          moveInto(StridedFwdIt(Copy.begin(), Stride2), StridedFwdIt(Copy.end(), Stride2)));
+  auto copy = src;
+  std::sort(src.begin(), src.end());
+  std::sort(copy.begin(), copy.end());
+
+  switch (pos) {
+  case OverlapPosition::None:
+    break;
+
+  case OverlapPosition::Front:
+    return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(copy.begin(), copy.begin() + size2));
+
+  case OverlapPosition::Back:
+    return std::make_pair(move_into(src.begin() + (src.size() - size1), src.end()),
+                          move_into(copy.begin() + (copy.size() - size2), copy.end()));
+
+  case OverlapPosition::Interlaced:
+    const auto stride1 = size1 < size2 ? size2 / size1 : 1;
+    const auto stride2 = size2 < size1 ? size1 / size2 : 1;
+    return std::make_pair(move_into(StridedFwdIt(src.begin(), stride1), StridedFwdIt(src.end(), stride1)),
+                          move_into(StridedFwdIt(copy.begin(), stride2), StridedFwdIt(copy.end(), stride2)));
   }
   abort();
   return std::pair<Container, Container>();
 }
 
-
 template <class ValueType, class Container, class Overlap>
 struct SetIntersection {
   using ContainerType = typename Container::template type<Value<ValueType>>;
-  size_t Size1;
-  size_t Size2;
+  size_t size1_;
+  size_t size2_;
 
-  SetIntersection(size_t M, size_t N) : Size1(M), Size2(N) {}
+  SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {}
 
   void run(benchmark::State& state) const {
     state.PauseTiming();
-    auto Input = genCacheUnfriendlyData<ContainerType>(Size1, Size2, Overlap());
-    std::vector<Value<ValueType>> out(std::min(Size1, Size2));
+    auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
+    std::vector<Value<ValueType>> out(std::min(size1_, size2_));
 
     size_t cmp;
-    auto trackingLess = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
-        ++cmp;
-        return std::less<Value<ValueType>>{}(lhs, rhs);
+    auto tracking_less = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
+      ++cmp;
+      return std::less<Value<ValueType>>{}(lhs, rhs);
     };
 
-    const auto BatchSize =  std::max(size_t{16}, (2*TestSetElements) / (Size1+Size2));
+    const auto BATCH_SIZE = std::max(size_t{16}, (2 * TestSetElements) / (size1_ + size2_));
     state.ResumeTiming();
 
     for (const auto& _ : state) {
-      while (state.KeepRunningBatch(BatchSize)) {
-        for (unsigned i=0; i<BatchSize; ++i) {
-          const auto& [C1, C2] = Input;
-          auto outIter = std::set_intersection(C1.begin(), C1.end(), C2.begin(), C2.end(), out.begin(), trackingLess);
-          benchmark::DoNotOptimize(outIter);
+      while (state.KeepRunningBatch(BATCH_SIZE)) {
+        for (unsigned i = 0; i < BATCH_SIZE; ++i) {
+          const auto& [c1, c2] = input;
+          auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin(), tracking_less);
+          benchmark::DoNotOptimize(res);
           state.counters["Comparisons"] = cmp;
         }
       }
@@ -208,17 +208,18 @@ struct SetIntersection {
   }
 
   std::string name() const {
-    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name +
-        ValueType::name() + '_' + std::to_string(Size1) + '_' + std::to_string(Size2);
+    return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name + ValueType::name() + '_' +
+           std::to_string(size1_) + '_' + std::to_string(size2_);
   }
 };
 
 } // namespace
 
-int main(int argc, char** argv) {/**/
+int main(int argc, char** argv) { /**/
   benchmark::Initialize(&argc, argv);
   if (benchmark::ReportUnrecognizedArguments(argc, argv))
     return 1;
-  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(Quantities, Quantities);
+  makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
+      Quantities, Quantities);
   benchmark::RunSpecifiedBenchmarks();
 }
diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 21117e6b7d7609..6ce9895f545a5d 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -91,7 +91,8 @@ struct _IterOps<_ClassicAlgPolicy> {
   // it's unclear whether _Iter has a difference_type and whether that's signed, so we play it safe:
   // use the incoming type for returning and steer clear of negative overflows
   template <class _Iter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
+  advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
     return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
   }
 
diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 3febcb411268fb..b1ecd1ae0d5698 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -79,12 +79,8 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
 }
 
 template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter __lower_bound(
-    _RandIter __first,
-    _Sent __last,
-    const _Type& __value,
-    _Comp& __comp,
-    _Proj& __proj) {
+_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
+__lower_bound(_RandIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
   const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
   return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
 }
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 46f6fbe4d3dd2b..a18bb6ff947b77 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -60,8 +60,8 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
         __comp_(__comp) {}
 
   _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-      _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-      operator()() && {
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+  operator()() && {
     while (__first2_ != __last2_) {
       __advance1_and_maybe_add_result();
       if (__first1_ == __last1_)
@@ -84,7 +84,7 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
   __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
     // use one-sided lower bound for improved algorithmic complexity bounds
     const auto __tmp = std::move(__iter);
-    __iter = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_);
+    __iter           = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_);
     __add_output_unless(__tmp != __iter);
   }
 
@@ -122,16 +122,16 @@ template <class _AlgPolicy,
           class _Sent2,
           class _OutIter>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
-    __set_intersection(
-        _InForwardIter1 __first1,
-        _Sent1 __last1,
-        _InForwardIter2 __first2,
-        _Sent2 __last2,
-        _OutIter __result,
-        _Compare&& __comp,
-        std::forward_iterator_tag,
-        std::forward_iterator_tag) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
+__set_intersection(
+    _InForwardIter1 __first1,
+    _Sent1 __last1,
+    _InForwardIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::forward_iterator_tag,
+    std::forward_iterator_tag) {
   std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
       __intersector(__first1, __last1, __first2, __last2, __result, __comp);
   return std::move(__intersector)();
@@ -146,16 +146,16 @@ template <class _AlgPolicy,
           class _Sent2,
           class _OutIter>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
-    __set_intersection(
-        _InInputIter1 __first1,
-        _Sent1 __last1,
-        _InInputIter2 __first2,
-        _Sent2 __last2,
-        _OutIter __result,
-        _Compare&& __comp,
-        std::input_iterator_tag,
-        std::input_iterator_tag) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+__set_intersection(
+    _InInputIter1 __first1,
+    _Sent1 __last1,
+    _InInputIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::input_iterator_tag,
+    std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -190,9 +190,9 @@ class __set_intersection_iter_category {
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-    _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-    __set_intersection(
-        _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+__set_intersection(
+    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
   return std::__set_intersection<_AlgPolicy>(
       std::move(__first1),
       std::move(__last1),

>From 450f5cebd41e425133fd221bf23b40bb20922eef Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 8 Jan 2024 21:51:27 +0000
Subject: [PATCH 10/44] General improvements to benchmark, including
 simplifying and slimming it down for faster runs, and including comparison
 counter.

---
 .../algorithms/set_intersection.bench.cpp     | 72 +++++++------------
 1 file changed, 27 insertions(+), 45 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 4fa411bba43549..baa5a7cdf05074 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <algorithm>
-#include <forward_list>
 #include <iterator>
 #include <set>
 #include <vector>
@@ -31,57 +30,26 @@ struct SetContainer {
   static constexpr const char* Name = "Set";
 };
 
-struct ForwardListContainer {
-  template <typename... Args>
-  using type = std::forward_list<Args...>;
-
-  static constexpr const char* Name = "ForwardList";
-};
-
-using AllContainerTypes = std::tuple<VectorContainer, SetContainer, ForwardListContainer>;
+using AllContainerTypes = std::tuple<VectorContainer, SetContainer>;
 
 // set_intersection performance may depend on where matching values lie
 enum class OverlapPosition {
   None,
   Front,
-  Back,
+  // performance-wise, matches at the back are identical to ones at the front
   Interlaced,
 };
 
-struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 4> {
-  static constexpr const char* Names[] = {"None", "Front", "Back", "Interlaced"};
+struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosition, 3> {
+  static constexpr const char* Names[] = {"None", "Front", "Interlaced"};
 };
 
 // functor that moves elements from an iterator range into a new Container instance
 template <typename Container>
-struct MoveInto {};
-
-template <typename T>
-struct MoveInto<std::vector<T>> {
-  template <class It>
-  [[nodiscard]] static std::vector<T> operator()(It first, It last) {
-    std::vector<T> out;
-    std::move(first, last, std::back_inserter(out));
-    return out;
-  }
-};
-
-template <typename T>
-struct MoveInto<std::forward_list<T>> {
+struct MoveInto {
   template <class It>
-  [[nodiscard]] static std::forward_list<T> operator()(It first, It last) {
-    std::forward_list<T> out;
-    std::move(first, last, std::front_inserter(out));
-    out.reverse();
-    return out;
-  }
-};
-
-template <typename T>
-struct MoveInto<std::set<T>> {
-  template <class It>
-  [[nodiscard]] static std::set<T> operator()(It first, It last) {
-    std::set<T> out;
+  [[nodiscard]] static Container operator()(It first, It last) {
+    Container out;
     std::move(first, last, std::inserter(out, out.begin()));
     return out;
   }
@@ -137,7 +105,7 @@ template <class Container>
 std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
   using ValueType = typename Container::value_type;
   const MoveInto<Container> move_into;
-  const auto src_size = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
+  const auto src_size        = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
   std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
 
   if (pos == OverlapPosition::None) {
@@ -159,10 +127,6 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
   case OverlapPosition::Front:
     return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(copy.begin(), copy.begin() + size2));
 
-  case OverlapPosition::Back:
-    return std::make_pair(move_into(src.begin() + (src.size() - size1), src.end()),
-                          move_into(copy.begin() + (copy.size() - size2), copy.end()));
-
   case OverlapPosition::Interlaced:
     const auto stride1 = size1 < size2 ? size2 / size1 : 1;
     const auto stride2 = size2 < size1 ? size1 / size2 : 1;
@@ -181,6 +145,11 @@ struct SetIntersection {
 
   SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {}
 
+  bool skip() const noexcept {
+    // let's save some time and skip simmetrical runs
+    return size1_ <= size2_;
+  }
+
   void run(benchmark::State& state) const {
     state.PauseTiming();
     auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
@@ -192,12 +161,13 @@ struct SetIntersection {
       return std::less<Value<ValueType>>{}(lhs, rhs);
     };
 
-    const auto BATCH_SIZE = std::max(size_t{16}, (2 * TestSetElements) / (size1_ + size2_));
+    const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
     state.ResumeTiming();
 
     for (const auto& _ : state) {
       while (state.KeepRunningBatch(BATCH_SIZE)) {
         for (unsigned i = 0; i < BATCH_SIZE; ++i) {
+          cmp                  = 0;
           const auto& [c1, c2] = input;
           auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin(), tracking_less);
           benchmark::DoNotOptimize(res);
@@ -219,6 +189,18 @@ int main(int argc, char** argv) { /**/
   benchmark::Initialize(&argc, argv);
   if (benchmark::ReportUnrecognizedArguments(argc, argv))
     return 1;
+  const std::vector<size_t> Quantities = {
+      1 << 0,
+      1 << 4,
+      1 << 8,
+      1 << 14,
+// Running each benchmark in parallel consumes too much memory with MSAN
+// and can lead to the test process being killed.
+#if !TEST_HAS_FEATURE(memory_sanitizer)
+      1 << 18
+#endif
+  };
+
   makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
       Quantities, Quantities);
   benchmark::RunSpecifiedBenchmarks();

>From d0c5f2b8d23c76db2ba325aa0fb6172d1b6eb1da Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 15 Jan 2024 16:19:29 +0000
Subject: [PATCH 11/44] Huh, I wonder how I got `git clang-format` to miss
 those changes =/

---
 .../algorithms/set_intersection.bench.cpp     | 10 ++--
 libcxx/include/__algorithm/set_intersection.h | 58 +++++++++----------
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index baa5a7cdf05074..38010170508a88 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -190,14 +190,14 @@ int main(int argc, char** argv) { /**/
   if (benchmark::ReportUnrecognizedArguments(argc, argv))
     return 1;
   const std::vector<size_t> Quantities = {
-      1 << 0,
-      1 << 4,
-      1 << 8,
-      1 << 14,
+    1 << 0,
+    1 << 4,
+    1 << 8,
+    1 << 14,
 // Running each benchmark in parallel consumes too much memory with MSAN
 // and can lead to the test process being killed.
 #if !TEST_HAS_FEATURE(memory_sanitizer)
-      1 << 18
+    1 << 18
 #endif
   };
 
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index a18bb6ff947b77..504350d10779e4 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -59,9 +59,9 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
         __result_(__result),
         __comp_(__comp) {}
 
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-  _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-  operator()() && {
+  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+      __set_intersection_result<_InIter1, _InIter2, _OutIter>
+      operator()() && {
     while (__first2_ != __last2_) {
       __advance1_and_maybe_add_result();
       if (__first1_ == __last1_)
@@ -121,17 +121,17 @@ template <class _AlgPolicy,
           class _InForwardIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
-__set_intersection(
-    _InForwardIter1 __first1,
-    _Sent1 __last1,
-    _InForwardIter2 __first2,
-    _Sent2 __last2,
-    _OutIter __result,
-    _Compare&& __comp,
-    std::forward_iterator_tag,
-    std::forward_iterator_tag) {
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+    __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
+    __set_intersection(
+        _InForwardIter1 __first1,
+        _Sent1 __last1,
+        _InForwardIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::forward_iterator_tag,
+        std::forward_iterator_tag) {
   std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
       __intersector(__first1, __last1, __first2, __last2, __result, __comp);
   return std::move(__intersector)();
@@ -145,17 +145,17 @@ template <class _AlgPolicy,
           class _InInputIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
-__set_intersection(
-    _InInputIter1 __first1,
-    _Sent1 __last1,
-    _InInputIter2 __first2,
-    _Sent2 __last2,
-    _OutIter __result,
-    _Compare&& __comp,
-    std::input_iterator_tag,
-    std::input_iterator_tag) {
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+    __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+    __set_intersection(
+        _InInputIter1 __first1,
+        _Sent1 __last1,
+        _InInputIter2 __first2,
+        _Sent2 __last2,
+        _OutIter __result,
+        _Compare&& __comp,
+        std::input_iterator_tag,
+        std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -189,10 +189,10 @@ class __set_intersection_iter_category {
 };
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI
-_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-__set_intersection(
-    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+    __set_intersection_result<_InIter1, _InIter2, _OutIter>
+    __set_intersection(
+        _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
   return std::__set_intersection<_AlgPolicy>(
       std::move(__first1),
       std::move(__last1),

>From faa31150e13902941cfa0c9ef87bff265b12d898 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Jan 2024 16:25:56 +0000
Subject: [PATCH 12/44] Oops, bad mistake while porting into libc++!
 `__lower_bound_onesided()` must start with `__step==0`, otherwise we can't
 match the complexity of linear search when continually matching (like a
 std::set_intersection() of matching containers will).

---
 libcxx/include/__algorithm/lower_bound.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index b1ecd1ae0d5698..dc86e2fa5c81d1 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -63,6 +63,12 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
   // __iterator_category<_Iter>>::value,
   //       "lower_bound() is a multipass algorithm and requires forward iterator or better");
 
+  // split the step 0 scenario: this allows us to match worst-case complexity
+  // when replacing linear search
+  if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value))
+    return __first;
+  ++__first;
+
   using _Distance = typename iterator_traits<_Iter>::difference_type;
   for (_Distance __step = 1; __first != __last; __step <<= 1) {
     auto __it   = __first;

>From 995d04b872c8552633c36e38d382897e8329d1e2 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Jan 2024 22:31:41 +0000
Subject: [PATCH 13/44] Oops, bad tracking of displacement on
 `stride_counting_iterator`

---
 libcxx/test/support/test_iterators.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 3b86a93564e4b5..d1e077e1b26554 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -826,7 +826,7 @@ class stride_counting_iterator {
         It tmp(base_);
         base_ = base(tmp += n);
         ++*stride_count_;
-        ++*stride_displacement_;
+        *stride_displacement_ += n;
         return *this;
     }
 
@@ -836,7 +836,7 @@ class stride_counting_iterator {
         It tmp(base_);
         base_ = base(tmp -= n);
         ++*stride_count_;
-        --*stride_displacement_;
+        *stride_displacement_ -= n;
         return *this;
     }
 

>From d568d491cef941e2cb03d85bcce9b7d2ec7314c4 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Jan 2024 22:33:13 +0000
Subject: [PATCH 14/44] Add more counters to the set_intersection benchmark,
 guard them behind an environment variable so we can choose to either measure
 time more accurately or obtain more information.

This led me down an interesting road of validating benchmark results and finding a significant discrepancy in timings between running all test cases at once and running them individually with `--benchmark_filter`.
---
 .../algorithms/set_intersection.bench.cpp     | 38 +++++++++++++------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 38010170508a88..b2de0c3223b005 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -9,9 +9,11 @@
 #include <algorithm>
 #include <iterator>
 #include <set>
+#include <stdlib.h>
 #include <vector>
 
 #include "common.h"
+#include "test_iterators.h"
 
 namespace {
 
@@ -137,6 +139,10 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
   return std::pair<Container, Container>();
 }
 
+// use environment variable to enable additional counters: instrumentation will
+// impact CPU utilisation, let's give the user the option
+static const bool TRACK_COUNTERS = getenv("TRACK_COUNTERS") != nullptr;
+
 template <class ValueType, class Container, class Overlap>
 struct SetIntersection {
   using ContainerType = typename Container::template type<Value<ValueType>>;
@@ -147,7 +153,7 @@ struct SetIntersection {
 
   bool skip() const noexcept {
     // let's save some time and skip simmetrical runs
-    return size1_ <= size2_;
+    return size1_ < size2_;
   }
 
   void run(benchmark::State& state) const {
@@ -155,23 +161,33 @@ struct SetIntersection {
     auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
     std::vector<Value<ValueType>> out(std::min(size1_, size2_));
 
-    size_t cmp;
-    auto tracking_less = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
-      ++cmp;
-      return std::less<Value<ValueType>>{}(lhs, rhs);
-    };
-
     const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
     state.ResumeTiming();
 
     for (const auto& _ : state) {
       while (state.KeepRunningBatch(BATCH_SIZE)) {
         for (unsigned i = 0; i < BATCH_SIZE; ++i) {
-          cmp                  = 0;
           const auto& [c1, c2] = input;
-          auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin(), tracking_less);
-          benchmark::DoNotOptimize(res);
-          state.counters["Comparisons"] = cmp;
+          if (TRACK_COUNTERS) {
+            size_t cmp{}, strides{}, displacement{};
+            auto tracking_less = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
+              ++cmp;
+              return std::less<Value<ValueType>>{}(lhs, rhs);
+            };
+            stride_counting_iterator b1(c1.begin(), &strides, &displacement);
+            stride_counting_iterator e1(c1.end(), &strides, &displacement);
+            stride_counting_iterator b2(c2.begin(), &strides, &displacement);
+            stride_counting_iterator e2(c2.end(), &strides, &displacement);
+            auto res = std::set_intersection(b1, e1, b2, e2, out.begin(), tracking_less);
+            benchmark::DoNotOptimize(res);
+            state.counters["comparisons"]       = cmp;
+            state.counters["iter_strides"]      = strides;
+            state.counters["iter_displacement"] = displacement;
+
+          } else {
+            auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
+            benchmark::DoNotOptimize(res);
+          }
         }
       }
     }

>From bb872e0b1d19a77450b8455c348d3f4669adcefb Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Thu, 1 Feb 2024 13:41:07 +0000
Subject: [PATCH 15/44] Revert "Oops, bad tracking of displacement on
 `stride_counting_iterator`"

This reverts commit 995d04b872c8552633c36e38d382897e8329d1e2.
---
 libcxx/test/support/test_iterators.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 189684022d1a34..191de7f3c8a36d 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -826,7 +826,7 @@ class stride_counting_iterator {
         It tmp(base_);
         base_ = base(tmp += n);
         ++*stride_count_;
-        *stride_displacement_ += n;
+        ++*stride_displacement_;
         return *this;
     }
 
@@ -836,7 +836,7 @@ class stride_counting_iterator {
         It tmp(base_);
         base_ = base(tmp -= n);
         ++*stride_count_;
-        *stride_displacement_ -= n;
+        --*stride_displacement_;
         return *this;
     }
 

>From a1cd8ffc82fe6021dbafb7b543e5472eda3ef87a Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Thu, 1 Feb 2024 17:39:08 +0000
Subject: [PATCH 16/44] * Fix C++03 compatibility issues. * Fix tests I had
 broken. * More tweaks and better comments.

---
 .../include/__algorithm/iterator_operations.h |  6 ++--
 libcxx/include/__algorithm/lower_bound.h      | 12 ++++---
 libcxx/include/__algorithm/set_intersection.h | 36 ++++++++++++++-----
 .../lower.bound/lower_bound.pass.cpp          | 11 ++++--
 .../lower.bound/lower_bound_comp.pass.cpp     | 25 +++++++++----
 .../ranges_set_intersection.pass.cpp          | 22 ++++++------
 6 files changed, 76 insertions(+), 36 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index d9a6e7f35df725..449d03d52e324b 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -98,7 +98,7 @@ struct _IterOps<_ClassicAlgPolicy> {
   template <class _InputIter, class _Distance>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
-    _Distance __dist{};
+    _Distance __dist = _Distance();
     for (; __dist < __count && __iter != __sentinel; ++__dist)
       ++__iter;
     return __count - __dist;
@@ -108,7 +108,7 @@ struct _IterOps<_ClassicAlgPolicy> {
   template <class _BiDirIter, class _Distance>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
-    _Distance __dist{};
+    _Distance __dist = _Distance();
     if (__count >= 0)
       for (; __dist < __count && __iter != __sentinel; ++__dist)
         ++__iter;
@@ -120,7 +120,7 @@ struct _IterOps<_ClassicAlgPolicy> {
 
   // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
   template <class _RandIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI constexpr static _Distance
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR static _Distance
   __advance(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
     auto __dist = _IterOps::distance(__iter, __sentinel);
     _LIBCPP_ASSERT_UNCATEGORIZED(
diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 12a9c4850460b7..e22700fa80269b 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -50,7 +50,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
 
 // One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
 // advantage of being Ω(1) rather than the classic algorithm's Ω(log(n)), with the downside of executing at most
-// 2*(log(n)-1) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
+// 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
 // the first one is when operating over non-random iterators, because the classic algorithm requires knowing the
 // container's size upfront, which adds Ω(n) iterator increments to the complexity. The second one is when you're
 // traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
@@ -63,11 +63,9 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
   // __iterator_category<_Iter>>::value,
   //       "lower_bound() is a multipass algorithm and requires forward iterator or better");
 
-  // split the step 0 scenario: this allows us to match worst-case complexity
-  // when replacing linear search
+  // step = 0, ensuring we can always short-circuit when distance is 1 later on
   if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value))
     return __first;
-  ++__first;
 
   using _Distance = typename iterator_traits<_Iter>::difference_type;
   for (_Distance __step = 1; __first != __last; __step <<= 1) {
@@ -76,10 +74,14 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
     // once we reach the last range where needle can be we must start
     // looking inwards, bisecting that range
     if (__it == __last || !std::__invoke(__comp, std::__invoke(__proj, *__it), __value)) {
+      // we've already checked the previous value and it was less, we can save
+      // one comparison by skipping bisection
+      if (__dist == 1)
+        return __it;
       return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
     }
     // range not found, move forward!
-    __first = std::move(__it);
+    __first = __it;
   }
   return __first;
 }
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index e3aa99d004eee5..00fedec3701d68 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -20,6 +20,7 @@
 #include <__type_traits/is_same.h>
 #include <__utility/exchange.h>
 #include <__utility/move.h>
+#include <__utility/swap.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -50,8 +51,7 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
   const _Sent2& __last2_;
   _OutIter& __result_;
   _Compare& __comp_;
-  static constexpr auto __proj_ = std::__identity();
-  bool __prev_advanced_         = true;
+  bool __prev_advanced_ = true;
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersector(
       _InIter1& __first1, _Sent1& __last1, _InIter2& __first2, _Sent2& __last2, _OutIter& __result, _Compare& __comp)
@@ -64,7 +64,7 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
 
   _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
       __set_intersection_result<_InIter1, _InIter2, _OutIter>
-      operator()() && {
+      operator()() {
     while (__first2_ != __last2_) {
       __advance1_and_maybe_add_result();
       if (__first1_ == __last1_)
@@ -85,9 +85,27 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
   template <class _Iter, class _Sent, class _Value>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
   __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
-    // use one-sided lower bound for improved algorithmic complexity bounds
-    const auto __tmp = std::move(__iter);
-    __iter           = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj_);
+    static _LIBCPP_CONSTEXPR std::__identity __proj;
+    // use one-sided binary search for improved algorithmic complexity bounds
+    // understanding how we can use binary search and still respect complexity
+    // guarantees is _not_ straightforward, so let me explain: the guarantee
+    // is "at most 2*(N+M)-1 comparisons", and one-sided binary search will
+    // necessarily overshoot depending on the position of the needle in the
+    // haystack -- for instance, if we're searching for 3 in (1, 2, 3, 4),
+    // we'll check if 3<1, then 3<2, then 3<4, and, finally, 3<3, for a total of
+    // 4 comparisons, when linear search would have yielded 3. However,
+    // because we won't need to perform the intervening reciprocal comparisons
+    // (ie 1<3, 2<3, 4<3), that extra comparison doesn't run afoul of the
+    // guarantee. Additionally, this type of scenario can only happen for match
+    // distances of up to 5 elements, because 2*log2(8) is 6, and we'll still
+    // be worse-off at position 5 of an 8-element set. From then onwards
+    // these scenarios can't happen.
+    // TL;DR: we'll be 1 comparison worse-off compared to the classic linear-
+    // searching algorithm if matching position 3 of a set with 4 elements,
+    // or position 5 if the set has 7 or 8 elements, but we'll never exceed
+    // the complexity guarantees from the standard.
+    _Iter __tmp = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj);
+    std::swap(__tmp, __iter);
     __add_output_unless(__tmp != __iter);
   }
 
@@ -137,7 +155,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
         std::forward_iterator_tag) {
   std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
       __intersector(__first1, __last1, __first2, __last2, __result, __comp);
-  return std::move(__intersector)();
+  return __intersector();
 }
 
 // input iterators are not suitable for multipass algorithms, so we stick to the classic single-pass version
@@ -183,7 +201,7 @@ class __set_intersection_iter_category {
   template <class _It>
   using __cat = typename std::_IterOps<_AlgPolicy>::template __iterator_category<_It>;
   template <class _It>
-  static auto test(__cat<_It>*) -> __cat<_It>;
+  static __cat<_It> test(__cat<_It>*);
   template <class>
   static std::input_iterator_tag test(...);
 
@@ -202,7 +220,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
       std::move(__first2),
       std::move(__last2),
       std::move(__result),
-      std::forward<_Compare>(__comp),
+      __comp,
       typename std::__set_intersection_iter_category<_AlgPolicy, _InIter1>::__type(),
       typename std::__set_intersection_iter_category<_AlgPolicy, _InIter2>::__type());
 }
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
index dd2916338e8f6e..196af84b69222f 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
@@ -39,10 +39,15 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-  std::size_t strides{};
-  std::size_t displacement{};
+#if TEST_STD_VER > 17
+  std::size_t strides      = 0;
+  std::size_t displacement = 0;
   stride_counting_iterator f(first, &strides, &displacement);
   stride_counting_iterator l(last, &strides, &displacement);
+#else
+  Iter& f = first;
+  Iter& l = last;
+#endif
 
   auto i = std::lower_bound(f, l, value);
   for (auto j = base(f); j != base(i); ++j)
@@ -50,9 +55,11 @@ test(Iter first, Iter last, const T& value)
   for (auto j = base(i); j != base(l); ++j)
     assert(!(*j < value));
 
+#if TEST_STD_VER > 17
   auto len = static_cast<std::size_t>(std::distance(first, last));
   assert(strides <= 2 * len);
   assert(displacement <= 2 * len);
+#endif
 }
 
 template <class Iter>
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
index ff928e23b9006a..643fd0052e479e 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
@@ -39,16 +39,27 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-  std::size_t strides{};
-  std::size_t displacement{};
+#if TEST_STD_VER > 17
+  std::size_t strides      = 0;
+  std::size_t displacement = 0;
   stride_counting_iterator f(first, &strides, &displacement);
   stride_counting_iterator l(last, &strides, &displacement);
+#else
+  Iter& f = first;
+  Iter& l = last;
+#endif
+
+  std::size_t comparisons = 0;
+  struct InstrumentedGreater {
+    explicit InstrumentedGreater(std::size_t* cmp) : comparisons_(cmp) {}
+    bool operator()(int rhs, int lhs) const {
+      ++*comparisons_;
+      return std::greater<int>()(rhs, lhs);
+    }
 
-  std::size_t comparisons{};
-  auto cmp = [&comparisons](int rhs, int lhs) {
-    ++comparisons;
-    return std::greater<int>()(rhs, lhs);
+    std::size_t* comparisons_;
   };
+  InstrumentedGreater cmp(&comparisons);
 
   auto i = std::lower_bound(f, l, value, cmp);
   for (auto j = base(f); j != base(i); ++j)
@@ -57,8 +68,10 @@ test(Iter first, Iter last, const T& value)
     assert(!std::greater<int>()(*j, value));
 
   auto len = static_cast<std::size_t>(std::distance(first, last));
+#if TEST_STD_VER > 17
   assert(strides <= 2 * len);
   assert(displacement <= 2 * len);
+#endif
   assert(comparisons <= std::ceil(std::log2(len + 1)));
 }
 
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
index f658a95778c858..2f3b0df9cda7cf 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
@@ -436,20 +436,20 @@ constexpr void testComplexityParameterizedIter() {
   }
 
   // Lower complexity when there is low overlap between ranges: we can make 2*log(X) comparisons when one range
-  // has X elements that can be skipped over.
+  // has X elements that can be skipped over (and then 1 more to confirm that the value we found is equal).
   {
     std::array r1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
     std::array r2{15};
     std::array expected{15};
 
     OperationCounts expectedCounts;
-    expectedCounts.comparisons                 = 8;
-    expectedCounts.in[0].proj                  = 8;
-    expectedCounts.in[0].iterator_strides      = 24;
-    expectedCounts.in[0].iterator_displacement = 24;
-    expectedCounts.in[1].proj                  = 8;
-    expectedCounts.in[1].iterator_strides      = 3;
-    expectedCounts.in[1].iterator_displacement = 3;
+    expectedCounts.comparisons                 = 9;
+    expectedCounts.in[0].proj                  = 9;
+    expectedCounts.in[0].iterator_strides      = 23;
+    expectedCounts.in[0].iterator_displacement = 23;
+    expectedCounts.in[1].proj                  = 9;
+    expectedCounts.in[1].iterator_strides      = 1;
+    expectedCounts.in[1].iterator_displacement = 1;
 
     testSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
     testRangesSetIntersectionAndReturnOpCounts<In1, In2, Out>(r1, r2, expected, expectedCounts);
@@ -721,9 +721,9 @@ constexpr bool test() {
       std::ranges::set_intersection(r1.begin(), r1.end(), r2.begin(), r2.end(), out.data(), comp, proj1, proj2);
 
       assert(std::ranges::equal(out, expected, {}, &Data::data));
-      assert(numberOfComp < maxOperation);
-      assert(numberOfProj1 < maxOperation);
-      assert(numberOfProj2 < maxOperation);
+      assert(numberOfComp <= maxOperation);
+      assert(numberOfProj1 <= maxOperation);
+      assert(numberOfProj2 <= maxOperation);
     }
 
     // range overload

>From 24d1d5b9a9d93f567be004e4a36e5b9147898b06 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Thu, 1 Feb 2024 18:04:17 +0000
Subject: [PATCH 17/44] Remove non-ascii characters, CI doesn't like them.

---
 libcxx/include/__algorithm/lower_bound.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index e22700fa80269b..6016502404002a 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -49,13 +49,14 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
 }
 
 // One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general
-// advantage of being Ω(1) rather than the classic algorithm's Ω(log(n)), with the downside of executing at most
-// 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
+// advantage of being \Omega(1) rather than the classic algorithm's \Omega(log(n)), with the downside of executing at
+// most 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines:
 // the first one is when operating over non-random iterators, because the classic algorithm requires knowing the
-// container's size upfront, which adds Ω(n) iterator increments to the complexity. The second one is when you're
+// container's size upfront, which adds \Omega(n) iterator increments to the complexity. The second one is when you're
 // traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm
-// would yield Ω(n*log(n)) comparisons and, for non-random iterators, Ω(n^2) iterator increments, whereas the one-sided
-// version will yield O(n) operations on both counts, with a Ω(log(n)) bound on the number of comparisons.
+// would yield \Omega(n*log(n)) comparisons and, for non-random iterators, \Omega(n^2) iterator increments, whereas the
+// one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of
+// comparisons.
 template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
 __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {

>From 4b7377367ed671f9a23e81f2211ff54d50a84d17 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Fri, 2 Feb 2024 19:02:19 +0000
Subject: [PATCH 18/44] Oops, missed an #include

---
 libcxx/include/__algorithm/iterator_operations.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 449d03d52e324b..5797e1d7e78d86 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/iter_swap.h>
 #include <__algorithm/ranges_iterator_concept.h>
+#include <__assert>
 #include <__config>
 #include <__iterator/advance.h>
 #include <__iterator/distance.h>

>From d0facc560c7d8ed38b27ee0a3bbbe367d134a096 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 5 Feb 2024 22:14:42 +0000
Subject: [PATCH 19/44] set_intersection.h: remove `static` from the local
 `constexpr` __proj, it breaks constexprness of the method.

---
 libcxx/include/__algorithm/set_intersection.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 00fedec3701d68..e3e93f9e755622 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -85,7 +85,7 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
   template <class _Iter, class _Sent, class _Value>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
   __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
-    static _LIBCPP_CONSTEXPR std::__identity __proj;
+    _LIBCPP_CONSTEXPR std::__identity __proj;
     // use one-sided binary search for improved algorithmic complexity bounds
     // understanding how we can use binary search and still respect complexity
     // guarantees is _not_ straightforward, so let me explain: the guarantee

>From a12aa376ec6eff642c6c0ed8b448215c4f8a89c9 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 5 Feb 2024 22:16:24 +0000
Subject: [PATCH 20/44] Fix constexpr shenanigans with gcc and
 `stride_counting_iterator`

---
 libcxx/test/support/test_iterators.h | 56 ++++++++++++++--------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 191de7f3c8a36d..d9eb149be38e32 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -734,6 +734,20 @@ template <class It,
           class StrideCountType        = std::iter_difference_t<It>,
           class StrideDisplacementType = std::iter_difference_t<It>>
 class stride_counting_iterator {
+  template <typename UnderlyingType>
+  struct concrete_or_ref {
+    using value_type            = std::remove_cv_t<std::remove_reference_t<UnderlyingType>>;
+    constexpr concrete_or_ref() = default;
+    explicit constexpr concrete_or_ref(UnderlyingType* c) noexcept : ptr_{c} {}
+
+    constexpr operator value_type&() noexcept { return ptr_ ? *ptr_ : val_; }
+    constexpr operator const value_type&() const noexcept { return ptr_ ? *ptr_ : val_; }
+
+  private:
+    value_type val_{};
+    value_type* ptr_{nullptr};
+  };
+
 public:
     using value_type = typename iter_value_or_void<It>::type;
     using difference_type = std::iter_difference_t<It>;
@@ -758,27 +772,14 @@ class stride_counting_iterator {
     constexpr stride_counting_iterator(const stride_counting_iterator& o) { *this = o; }
     constexpr stride_counting_iterator(stride_counting_iterator&& o) { *this = o; }
 
-    constexpr stride_counting_iterator& operator=(const stride_counting_iterator& o) {
-      base_ = o.base_;
-      // if memory backing count is owned by the object, copy values
-      if (o.stride_count_ == &o.stride_count_default_) {
-        assert(o.stride_displacement_ == &o.stride_displacement_default_);
-        *stride_count_        = *o.stride_count_;
-        *stride_displacement_ = *o.stride_displacement_;
-        return *this;
-      }
-      // otherwise share the same externally-owned variables
-      stride_count_        = o.stride_count_;
-      stride_displacement_ = o.stride_displacement_;
-      return *this;
-    }
+    constexpr stride_counting_iterator& operator=(const stride_counting_iterator& o) = default;
     constexpr stride_counting_iterator& operator=(stride_counting_iterator&& o) { return *this = o; }
 
     friend constexpr It base(stride_counting_iterator const& it) { return It(it.base_); }
 
-    constexpr StrideCountType stride_count() const { return *stride_count_; }
+    constexpr StrideCountType stride_count() const { return stride_count_; }
 
-    constexpr StrideDisplacementType stride_displacement() const { return *stride_displacement_; }
+    constexpr StrideDisplacementType stride_displacement() const { return stride_displacement_; }
 
     constexpr decltype(auto) operator*() const { return *It(base_); }
 
@@ -787,8 +788,8 @@ class stride_counting_iterator {
     constexpr stride_counting_iterator& operator++() {
         It tmp(base_);
         base_ = base(++tmp);
-        ++*stride_count_;
-        ++*stride_displacement_;
+        ++stride_count_;
+        ++stride_displacement_;
         return *this;
     }
 
@@ -807,8 +808,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(--tmp);
-        ++*stride_count_;
-        --*stride_displacement_;
+        ++stride_count_;
+        --stride_displacement_;
         return *this;
     }
 
@@ -825,8 +826,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(tmp += n);
-        ++*stride_count_;
-        ++*stride_displacement_;
+        ++stride_count_;
+        ++stride_displacement_;
         return *this;
     }
 
@@ -835,8 +836,8 @@ class stride_counting_iterator {
     {
         It tmp(base_);
         base_ = base(tmp -= n);
-        ++*stride_count_;
-        --*stride_displacement_;
+        ++stride_count_;
+        --stride_displacement_;
         return *this;
     }
 
@@ -899,11 +900,8 @@ class stride_counting_iterator {
 
 private:
     decltype(base(std::declval<It>())) base_;
-    StrideCountType stride_count_default_               = 0;
-    StrideDisplacementType stride_displacement_default_ = 0;
-
-    StrideCountType* stride_count_               = &stride_count_default_;
-    StrideDisplacementType* stride_displacement_ = &stride_displacement_default_;
+    concrete_or_ref<StrideCountType> stride_count_;
+    concrete_or_ref<StrideDisplacementType> stride_displacement_;
 };
 template <class It>
 stride_counting_iterator(It) -> stride_counting_iterator<It>;

>From 69dba78ed467990aa6a8a8cbb032706cfc551a20 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 6 Feb 2024 21:31:30 +0000
Subject: [PATCH 21/44] Restrict number of constexpr steps so
 `ranges_set_intersection.pass.cpp` is ok on gcc.

---
 .../ranges_set_intersection.pass.cpp              | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
index 2f3b0df9cda7cf..162f6ca8b7f357 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
@@ -483,18 +483,21 @@ constexpr void testComplexityParameterizedIterPermutateIn1() {
 }
 
 template <class Out>
-constexpr void testComplexityParameterizedIterPermutateIn1In2() {
+constexpr bool testComplexityParameterizedIterPermutateIn1In2() {
   testComplexityParameterizedIterPermutateIn1<forward_iterator, Out>();
   testComplexityParameterizedIterPermutateIn1<bidirectional_iterator, Out>();
   testComplexityParameterizedIterPermutateIn1<random_access_iterator, Out>();
+  return true;
 }
 
-constexpr bool testComplexityMultipleTypes() {
-  //testComplexityParameterizedIter<cpp20_input_iterator, random_access_iterator, OutIter>();
+constexpr void runAllComplexityTests() {
   testComplexityParameterizedIterPermutateIn1In2<forward_iterator<int*>>();
   testComplexityParameterizedIterPermutateIn1In2<bidirectional_iterator<int*>>();
   testComplexityParameterizedIterPermutateIn1In2<random_access_iterator<int*>>();
-  return true;
+
+  static_assert(testComplexityParameterizedIterPermutateIn1In2<forward_iterator<int*>>());
+  static_assert(testComplexityParameterizedIterPermutateIn1In2<bidirectional_iterator<int*>>());
+  static_assert(testComplexityParameterizedIterPermutateIn1In2<random_access_iterator<int*>>());
 }
 
 constexpr bool test() {
@@ -797,8 +800,8 @@ int main(int, char**) {
   // than the step limit.
   runAllIteratorPermutationsTests();
 
-  testComplexityMultipleTypes();
-  static_assert(testComplexityMultipleTypes());
+  // similar for complexity tests
+  runAllComplexityTests();
 
   return 0;
 }

>From fe1fe8c4607044ad14d42a6e6d713b7f58f4ef11 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 12 Feb 2024 08:16:56 +0000
Subject: [PATCH 22/44] Fix constexpr annotation and make internal methods
 private in _IterOps

---
 libcxx/include/__algorithm/iterator_operations.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 5797e1d7e78d86..c9fd4376595bd0 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -95,6 +95,7 @@ struct _IterOps<_ClassicAlgPolicy> {
     return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
   }
 
+private:
   // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
   template <class _InputIter, class _Distance>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
@@ -121,7 +122,7 @@ struct _IterOps<_ClassicAlgPolicy> {
 
   // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
   template <class _RandIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR static _Distance
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
     auto __dist = _IterOps::distance(__iter, __sentinel);
     _LIBCPP_ASSERT_UNCATEGORIZED(
@@ -134,6 +135,7 @@ struct _IterOps<_ClassicAlgPolicy> {
     return __count - __dist;
   }
 
+public:
   // distance
   template <class _Iter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static typename iterator_traits<_Iter>::difference_type

>From bb2c7588947e92451a46f310fe63cd137303fcff Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 12 Feb 2024 08:18:15 +0000
Subject: [PATCH 23/44] Allow for assertions in comparison count when in
 hardened mode for complexity validation.

---
 .../set.intersection/ranges_set_intersection.pass.cpp  | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
index 162f6ca8b7f357..4858493145af98 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp
@@ -299,7 +299,14 @@ struct [[nodiscard]] OperationCounts {
   std::array<PerInput, 2> in;
 
   [[nodiscard]] constexpr bool matchesExpectation(const OperationCounts& expect) {
-    return comparisons <= expect.comparisons && in[0].matchesExpectation(expect.in[0]) &&
+    // __debug_less will perform an additional comparison in an assertion
+    constexpr unsigned comparison_multiplier =
+#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+        2;
+#else
+        1;
+#endif
+    return comparisons <= comparison_multiplier * expect.comparisons && in[0].matchesExpectation(expect.in[0]) &&
            in[1].matchesExpectation(expect.in[1]);
   }
 
@@ -309,7 +316,6 @@ struct [[nodiscard]] OperationCounts {
 };
 } // namespace
 
-#include <iostream>
 template <template <class...> class In1,
           template <class...>
           class In2,

>From c6b895c46b5f11d988506a4bb66a655dfad275bb Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 13 Feb 2024 20:12:24 +0000
Subject: [PATCH 24/44] Revert lower_bound.pass.cpp changes, will move into a
 new PR.

---
 .../lower.bound/lower_bound.pass.cpp          | 26 ++++---------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
index 196af84b69222f..a2d8ab632303cb 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound.pass.cpp
@@ -39,27 +39,11 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-#if TEST_STD_VER > 17
-  std::size_t strides      = 0;
-  std::size_t displacement = 0;
-  stride_counting_iterator f(first, &strides, &displacement);
-  stride_counting_iterator l(last, &strides, &displacement);
-#else
-  Iter& f = first;
-  Iter& l = last;
-#endif
-
-  auto i = std::lower_bound(f, l, value);
-  for (auto j = base(f); j != base(i); ++j)
-    assert(*j < value);
-  for (auto j = base(i); j != base(l); ++j)
-    assert(!(*j < value));
-
-#if TEST_STD_VER > 17
-  auto len = static_cast<std::size_t>(std::distance(first, last));
-  assert(strides <= 2 * len);
-  assert(displacement <= 2 * len);
-#endif
+    Iter i = std::lower_bound(first, last, value);
+    for (Iter j = first; j != i; ++j)
+        assert(*j < value);
+    for (Iter j = i; j != last; ++j)
+        assert(!(*j < value));
 }
 
 template <class Iter>

>From 31321b943b9a5c9358844725b2e0bc0910055062 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 13 Feb 2024 20:14:34 +0000
Subject: [PATCH 25/44] Oops, forgot to revert this one too.

---
 .../lower.bound/lower_bound_comp.pass.cpp     | 40 +++----------------
 1 file changed, 5 insertions(+), 35 deletions(-)

diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
index 643fd0052e479e..b9133028d9ade2 100644
--- a/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.sorting/alg.binary.search/lower.bound/lower_bound_comp.pass.cpp
@@ -17,7 +17,6 @@
 #include <vector>
 #include <cassert>
 #include <cstddef>
-#include <cmath>
 
 #include "test_macros.h"
 #include "test_iterators.h"
@@ -39,40 +38,11 @@ template <class Iter, class T>
 void
 test(Iter first, Iter last, const T& value)
 {
-#if TEST_STD_VER > 17
-  std::size_t strides      = 0;
-  std::size_t displacement = 0;
-  stride_counting_iterator f(first, &strides, &displacement);
-  stride_counting_iterator l(last, &strides, &displacement);
-#else
-  Iter& f = first;
-  Iter& l = last;
-#endif
-
-  std::size_t comparisons = 0;
-  struct InstrumentedGreater {
-    explicit InstrumentedGreater(std::size_t* cmp) : comparisons_(cmp) {}
-    bool operator()(int rhs, int lhs) const {
-      ++*comparisons_;
-      return std::greater<int>()(rhs, lhs);
-    }
-
-    std::size_t* comparisons_;
-  };
-  InstrumentedGreater cmp(&comparisons);
-
-  auto i = std::lower_bound(f, l, value, cmp);
-  for (auto j = base(f); j != base(i); ++j)
-    assert(std::greater<int>()(*j, value));
-  for (auto j = base(i); j != base(l); ++j)
-    assert(!std::greater<int>()(*j, value));
-
-  auto len = static_cast<std::size_t>(std::distance(first, last));
-#if TEST_STD_VER > 17
-  assert(strides <= 2 * len);
-  assert(displacement <= 2 * len);
-#endif
-  assert(comparisons <= std::ceil(std::log2(len + 1)));
+    Iter i = std::lower_bound(first, last, value, std::greater<int>());
+    for (Iter j = first; j != i; ++j)
+        assert(std::greater<int>()(*j, value));
+    for (Iter j = i; j != last; ++j)
+        assert(!std::greater<int>()(*j, value));
 }
 
 template <class Iter>

>From 3805e95cf63137cef87f796436abcf2038923e29 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 16:29:06 +0100
Subject: [PATCH 26/44] s/_LIBCPP_NODISCARD_EXT/_LIBCPP_NODISCARD/ after
 merging #87094

---
 libcxx/include/__algorithm/lower_bound.h      |  6 +++---
 libcxx/include/__algorithm/set_intersection.h | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 56084215e37cab..9424a50373fade 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -28,7 +28,7 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _AlgPolicy, class _Iter, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting(
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting(
     _Iter __first,
     const _Type& __value,
     typename iterator_traits<_Iter>::difference_type __len,
@@ -58,7 +58,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
 // one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of
 // comparisons.
 template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
 __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
   // static_assert(std::is_base_of<std::forward_iterator_tag, typename _IterOps<_AlgPolicy>::template
   // __iterator_category<_Iter>>::value,
@@ -88,7 +88,7 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
 }
 
 template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
 __lower_bound(_RandIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
   const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
   return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index e3e93f9e755622..340dd7ec8b5527 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -44,7 +44,7 @@ struct __set_intersection_result {
 };
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-struct _LIBCPP_NODISCARD_EXT __set_intersector {
+struct _LIBCPP_NODISCARD __set_intersector {
   _InIter1& __first1_;
   const _Sent1& __last1_;
   _InIter2& __first2_;
@@ -62,7 +62,7 @@ struct _LIBCPP_NODISCARD_EXT __set_intersector {
         __result_(__result),
         __comp_(__comp) {}
 
-  _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
       __set_intersection_result<_InIter1, _InIter2, _OutIter>
       operator()() {
     while (__first2_ != __last2_) {
@@ -142,7 +142,7 @@ template <class _AlgPolicy,
           class _InForwardIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
     __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
     __set_intersection(
         _InForwardIter1 __first1,
@@ -166,7 +166,7 @@ template <class _AlgPolicy,
           class _InInputIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
     __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
     __set_intersection(
         _InInputIter1 __first1,
@@ -210,7 +210,7 @@ class __set_intersection_iter_category {
 };
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
     __set_intersection_result<_InIter1, _InIter2, _OutIter>
     __set_intersection(
         _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {

>From 090df863302f92ade54e54dbb3b34ba0d9c58c63 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 11:52:10 +0100
Subject: [PATCH 27/44] Address feedback about qualifying abort(); add a
 comment to clarify the choice of not having a `default` case in `switch`.

---
 libcxx/benchmarks/algorithms/set_intersection.bench.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index b2de0c3223b005..396adb5067a40e 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -124,6 +124,7 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
 
   switch (pos) {
   case OverlapPosition::None:
+    // we like -Wswitch :)
     break;
 
   case OverlapPosition::Front:
@@ -135,7 +136,7 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
     return std::make_pair(move_into(StridedFwdIt(src.begin(), stride1), StridedFwdIt(src.end(), stride1)),
                           move_into(StridedFwdIt(copy.begin(), stride2), StridedFwdIt(copy.end(), stride2)));
   }
-  abort();
+  std::abort(); // would be std::unreachable() if it could
   return std::pair<Container, Container>();
 }
 

>From cb92d3cd7621b2360d1736c7aa53962c9226118f Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 11:53:20 +0100
Subject: [PATCH 28/44] Address comment about broken comment for
 `getVectorOfRandom()`: move the function closer to its point of usage and
 document what `genCacheUnfriendlyData()` is trying to do in its own comment.
 `getVectorOfRandom()` has imho a good name which describes all it's meant to
 achieve; it's `genCacheUnfriendlyData()` that needs explaining.

---
 .../algorithms/set_intersection.bench.cpp     | 48 ++++++++++---------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 396adb5067a40e..521e184f81a122 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -46,27 +46,6 @@ struct AllOverlapPositions : EnumValuesAsTuple<AllOverlapPositions, OverlapPosit
   static constexpr const char* Names[] = {"None", "Front", "Interlaced"};
 };
 
-// functor that moves elements from an iterator range into a new Container instance
-template <typename Container>
-struct MoveInto {
-  template <class It>
-  [[nodiscard]] static Container operator()(It first, It last) {
-    Container out;
-    std::move(first, last, std::inserter(out, out.begin()));
-    return out;
-  }
-};
-
-// lightweight wrapping around fillValues() which puts a little effort into
-// making that would be contiguous when sorted non-contiguous in memory
-template <typename T>
-std::vector<T> getVectorOfRandom(size_t N) {
-  std::vector<T> v;
-  fillValues(v, N, Order::Random);
-  sortValues(v, Order::Random);
-  return std::vector<T>(v);
-}
-
 // forward_iterator wrapping which, for each increment, moves the underlying iterator forward Stride elements
 template <typename Wrapped>
 struct StridedFwdIt {
@@ -101,8 +80,31 @@ struct StridedFwdIt {
 template <typename Wrapped>
 StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
 
-// realistically, data won't all be nicely contiguous in a container
+// functor that moves elements from an iterator range into a new Container instance
+template <typename Container>
+struct MoveInto {
+  template <class It>
+  [[nodiscard]] static Container operator()(It first, It last) {
+    Container out;
+    std::move(first, last, std::inserter(out, out.begin()));
+    return out;
+  }
+};
+
+template <typename T>
+std::vector<T> getVectorOfRandom(size_t N) {
+  std::vector<T> v;
+  fillValues(v, N, Order::Random);
+  sortValues(v, Order::Random);
+  return std::vector<T>(v);
+}
+
+// realistically, data won't all be nicely contiguous in a container,
 // we'll go through some effort to ensure that it's shuffled through memory
+// this is especially important for containers with non-contiguous element
+// storage, but it will affect even a std::vector, because when you copy a
+// std::vector<std::string> the underlying data storage position for the char
+// arrays of the copy are likely to have high locality
 template <class Container>
 std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
   using ValueType = typename Container::value_type;
@@ -116,7 +118,7 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
   }
 
   // all other overlap types will have to copy some part of the data, but if
-  // we copy after sorting it will likely have high cache locality, so we sort
+  // we copy after sorting it will likely have high locality, so we sort
   // each copy separately
   auto copy = src;
   std::sort(src.begin(), src.end());

>From f4a6f3630cc5fba532dae97cd2cc97b1d929a9cd Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 17:49:11 +0100
Subject: [PATCH 29/44] Oops, forgot to format =/. Still working on the
 remaining feedback, but it would be good to be sure that we have a good
 baseline after this big merge from main.

---
 libcxx/include/__algorithm/set_intersection.h | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 340dd7ec8b5527..a5a86baa345cb7 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -62,9 +62,9 @@ struct _LIBCPP_NODISCARD __set_intersector {
         __result_(__result),
         __comp_(__comp) {}
 
-  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
-      __set_intersection_result<_InIter1, _InIter2, _OutIter>
-      operator()() {
+  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+  operator()() {
     while (__first2_ != __last2_) {
       __advance1_and_maybe_add_result();
       if (__first1_ == __last1_)
@@ -142,17 +142,17 @@ template <class _AlgPolicy,
           class _InForwardIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
-    __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
-    __set_intersection(
-        _InForwardIter1 __first1,
-        _Sent1 __last1,
-        _InForwardIter2 __first2,
-        _Sent2 __last2,
-        _OutIter __result,
-        _Compare&& __comp,
-        std::forward_iterator_tag,
-        std::forward_iterator_tag) {
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>
+__set_intersection(
+    _InForwardIter1 __first1,
+    _Sent1 __last1,
+    _InForwardIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::forward_iterator_tag,
+    std::forward_iterator_tag) {
   std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
       __intersector(__first1, __last1, __first2, __last2, __result, __comp);
   return __intersector();
@@ -166,17 +166,17 @@ template <class _AlgPolicy,
           class _InInputIter2,
           class _Sent2,
           class _OutIter>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
-    __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
-    __set_intersection(
-        _InInputIter1 __first1,
-        _Sent1 __last1,
-        _InInputIter2 __first2,
-        _Sent2 __last2,
-        _OutIter __result,
-        _Compare&& __comp,
-        std::input_iterator_tag,
-        std::input_iterator_tag) {
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>
+__set_intersection(
+    _InInputIter1 __first1,
+    _Sent1 __last1,
+    _InInputIter2 __first2,
+    _Sent2 __last2,
+    _OutIter __result,
+    _Compare&& __comp,
+    std::input_iterator_tag,
+    std::input_iterator_tag) {
   while (__first1 != __last1 && __first2 != __last2) {
     if (__comp(*__first1, *__first2))
       ++__first1;
@@ -210,10 +210,10 @@ class __set_intersection_iter_category {
 };
 
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
-    __set_intersection_result<_InIter1, _InIter2, _OutIter>
-    __set_intersection(
-        _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
+__set_intersection(
+    _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) {
   return std::__set_intersection<_AlgPolicy>(
       std::move(__first1),
       std::move(__last1),

>From 3f9cfec32224b97692f6c6119c00b3abd309c508 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 20:35:27 +0100
Subject: [PATCH 30/44] Address comment about making the benchmark's `struct
 MoveInto` into a function -- make it a lambda, to avoid the explicit template
 parameter a freestanding function would require.

---
 .../algorithms/set_intersection.bench.cpp       | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 521e184f81a122..74ba9e8a4ad1a2 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -80,17 +80,6 @@ struct StridedFwdIt {
 template <typename Wrapped>
 StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt<Wrapped>;
 
-// functor that moves elements from an iterator range into a new Container instance
-template <typename Container>
-struct MoveInto {
-  template <class It>
-  [[nodiscard]] static Container operator()(It first, It last) {
-    Container out;
-    std::move(first, last, std::inserter(out, out.begin()));
-    return out;
-  }
-};
-
 template <typename T>
 std::vector<T> getVectorOfRandom(size_t N) {
   std::vector<T> v;
@@ -108,7 +97,11 @@ std::vector<T> getVectorOfRandom(size_t N) {
 template <class Container>
 std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
   using ValueType = typename Container::value_type;
-  const MoveInto<Container> move_into;
+  auto move_into = [](auto first, auto last) {
+      Container out;
+      std::move(first, last, std::inserter(out, out.begin()));
+      return out;
+  };
   const auto src_size        = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
   std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
 

>From 1afb99d14541f2388464fa43c5cf6cbb5ec701a6 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 20:37:19 +0100
Subject: [PATCH 31/44] Address comment about using `common.h`'s `Quantities`
 constant in the benchmark.

---
 .../benchmarks/algorithms/set_intersection.bench.cpp  | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 74ba9e8a4ad1a2..30e580d4813d41 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -201,17 +201,6 @@ int main(int argc, char** argv) { /**/
   benchmark::Initialize(&argc, argv);
   if (benchmark::ReportUnrecognizedArguments(argc, argv))
     return 1;
-  const std::vector<size_t> Quantities = {
-    1 << 0,
-    1 << 4,
-    1 << 8,
-    1 << 14,
-// Running each benchmark in parallel consumes too much memory with MSAN
-// and can lead to the test process being killed.
-#if !TEST_HAS_FEATURE(memory_sanitizer)
-    1 << 18
-#endif
-  };
 
   makeCartesianProductBenchmark<SetIntersection, AllValueTypes, AllContainerTypes, AllOverlapPositions>(
       Quantities, Quantities);

>From 613e64af77a9a48240ae21bc515271e246061c1e Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 22:20:16 +0100
Subject: [PATCH 32/44] Address feedback to improve assertion in
 _IterOps::__advance()

---
 libcxx/include/__algorithm/iterator_operations.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index c9fd4376595bd0..4e1ff4bb83f15c 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -125,8 +125,8 @@ struct _IterOps<_ClassicAlgPolicy> {
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
   __advance(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
     auto __dist = _IterOps::distance(__iter, __sentinel);
-    _LIBCPP_ASSERT_UNCATEGORIZED(
-        __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count<0");
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(
+        __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0");
     if (__count < 0)
       __dist = __dist > __count ? __dist : __count;
     else

>From 4588447fd009eb95aa8fc7f9d7e4e3837df92ea3 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 22:28:08 +0100
Subject: [PATCH 33/44] Rename new sentinel-based `_IterOps::advance()` to
 `_IterOps::__advance_to` -- no reason IMO to have a second overload if
 `__advance_to = ranges::advance` in c++20...

---
 libcxx/include/__algorithm/iterator_operations.h | 10 +++++-----
 libcxx/include/__algorithm/lower_bound.h         |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 4e1ff4bb83f15c..12a76cdfbeab22 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -91,15 +91,15 @@ struct _IterOps<_ClassicAlgPolicy> {
   // use the incoming type for returning and steer clear of negative overflows
   template <class _Iter, class _Distance>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  advance(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
-    return _IterOps::__advance(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
+  __advance_to(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+    return _IterOps::__advance_to(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
   }
 
 private:
   // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
   template <class _InputIter, class _Distance>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  __advance(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
+  __advance_to(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
     _Distance __dist = _Distance();
     for (; __dist < __count && __iter != __sentinel; ++__dist)
       ++__iter;
@@ -109,7 +109,7 @@ struct _IterOps<_ClassicAlgPolicy> {
   // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
   template <class _BiDirIter, class _Distance>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  __advance(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
+  __advance_to(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
     _Distance __dist = _Distance();
     if (__count >= 0)
       for (; __dist < __count && __iter != __sentinel; ++__dist)
@@ -123,7 +123,7 @@ struct _IterOps<_ClassicAlgPolicy> {
   // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
   template <class _RandIter, class _Distance>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  __advance(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
+  __advance_to(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
     auto __dist = _IterOps::distance(__iter, __sentinel);
     _LIBCPP_ASSERT_VALID_INPUT_RANGE(
         __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0");
diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index 9424a50373fade..f92befc97e1c06 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -71,7 +71,7 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
   using _Distance = typename iterator_traits<_Iter>::difference_type;
   for (_Distance __step = 1; __first != __last; __step <<= 1) {
     auto __it   = __first;
-    auto __dist = __step - _IterOps<_AlgPolicy>::advance(__it, __step, __last);
+    auto __dist = __step - _IterOps<_AlgPolicy>::__advance_to(__it, __step, __last);
     // once we reach the last range where needle can be we must start
     // looking inwards, bisecting that range
     if (__it == __last || !std::__invoke(__comp, std::__invoke(__proj, *__it), __value)) {

>From 2af9a6fb935c7cfe75f2ea9b546786120c4178ba Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 22:42:20 +0100
Subject: [PATCH 34/44] Address feedback about using
 `iterator_traits<_Iter>::difference_type` instead of a templated `_Distance`
 in `_IterOps::__advance_to()`

---
 .../include/__algorithm/iterator_operations.h | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 12a76cdfbeab22..571bd13b0e2400 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -89,28 +89,28 @@ struct _IterOps<_ClassicAlgPolicy> {
   // advance with sentinel, a la std::ranges::advance
   // it's unclear whether _Iter has a difference_type and whether that's signed, so we play it safe:
   // use the incoming type for returning and steer clear of negative overflows
-  template <class _Iter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  __advance_to(_Iter& __iter, _Distance __count, const _Iter& __sentinel) {
+  template <class _Iter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_Iter>
+  __advance_to(_Iter& __iter, __difference_type<_Iter> __count, const _Iter& __sentinel) {
     return _IterOps::__advance_to(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category());
   }
 
 private:
   // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
-  template <class _InputIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  __advance_to(_InputIter& __iter, _Distance __count, const _InputIter& __sentinel, input_iterator_tag) {
-    _Distance __dist = _Distance();
+  template <class _InputIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter>
+  __advance_to(_InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) {
+    __difference_type<_InputIter> __dist = 0;
     for (; __dist < __count && __iter != __sentinel; ++__dist)
       ++__iter;
     return __count - __dist;
   }
 
   // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
-  template <class _BiDirIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  __advance_to(_BiDirIter& __iter, _Distance __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
-    _Distance __dist = _Distance();
+  template <class _BiDirIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_BiDirIter>
+  __advance_to(_BiDirIter& __iter, __difference_type<_BiDirIter> __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
+    __difference_type<_BiDirIter> __dist = 0;
     if (__count >= 0)
       for (; __dist < __count && __iter != __sentinel; ++__dist)
         ++__iter;
@@ -121,9 +121,9 @@ struct _IterOps<_ClassicAlgPolicy> {
   }
 
   // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
-  template <class _RandIter, class _Distance>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static _Distance
-  __advance_to(_RandIter& __iter, _Distance __count, const _RandIter& __sentinel, random_access_iterator_tag) {
+  template <class _RandIter>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_RandIter>
+  __advance_to(_RandIter& __iter, __difference_type<_RandIter> __count, const _RandIter& __sentinel, random_access_iterator_tag) {
     auto __dist = _IterOps::distance(__iter, __sentinel);
     _LIBCPP_ASSERT_VALID_INPUT_RANGE(
         __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0");

>From 4f05ded78a5032f8ed9ff471b65860c97981bd6a Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Tue, 23 Apr 2024 22:55:40 +0100
Subject: [PATCH 35/44] git clang-format on the last batch of changes

---
 .../algorithms/set_intersection.bench.cpp          |  8 ++++----
 libcxx/include/__algorithm/iterator_operations.h   | 14 ++++++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 30e580d4813d41..2233b85f1162f0 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -97,10 +97,10 @@ std::vector<T> getVectorOfRandom(size_t N) {
 template <class Container>
 std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) {
   using ValueType = typename Container::value_type;
-  auto move_into = [](auto first, auto last) {
-      Container out;
-      std::move(first, last, std::inserter(out, out.begin()));
-      return out;
+  auto move_into  = [](auto first, auto last) {
+    Container out;
+    std::move(first, last, std::inserter(out, out.begin()));
+    return out;
   };
   const auto src_size        = pos == OverlapPosition::None ? size1 + size2 : std::max(size1, size2);
   std::vector<ValueType> src = getVectorOfRandom<ValueType>(src_size);
diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h
index 571bd13b0e2400..6edafe26fb42ba 100644
--- a/libcxx/include/__algorithm/iterator_operations.h
+++ b/libcxx/include/__algorithm/iterator_operations.h
@@ -98,8 +98,8 @@ struct _IterOps<_ClassicAlgPolicy> {
 private:
   // advance with sentinel, a la std::ranges::advance -- InputIterator specialization
   template <class _InputIter>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter>
-  __advance_to(_InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) {
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter> __advance_to(
+      _InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) {
     __difference_type<_InputIter> __dist = 0;
     for (; __dist < __count && __iter != __sentinel; ++__dist)
       ++__iter;
@@ -109,7 +109,10 @@ struct _IterOps<_ClassicAlgPolicy> {
   // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization
   template <class _BiDirIter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_BiDirIter>
-  __advance_to(_BiDirIter& __iter, __difference_type<_BiDirIter> __count, const _BiDirIter& __sentinel, bidirectional_iterator_tag) {
+  __advance_to(_BiDirIter& __iter,
+               __difference_type<_BiDirIter> __count,
+               const _BiDirIter& __sentinel,
+               bidirectional_iterator_tag) {
     __difference_type<_BiDirIter> __dist = 0;
     if (__count >= 0)
       for (; __dist < __count && __iter != __sentinel; ++__dist)
@@ -123,7 +126,10 @@ struct _IterOps<_ClassicAlgPolicy> {
   // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization
   template <class _RandIter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_RandIter>
-  __advance_to(_RandIter& __iter, __difference_type<_RandIter> __count, const _RandIter& __sentinel, random_access_iterator_tag) {
+  __advance_to(_RandIter& __iter,
+               __difference_type<_RandIter> __count,
+               const _RandIter& __sentinel,
+               random_access_iterator_tag) {
     auto __dist = _IterOps::distance(__iter, __sentinel);
     _LIBCPP_ASSERT_VALID_INPUT_RANGE(
         __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0");

>From 161d81cfb744fc3cd5abe9717ae93f8e5c23874e Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Wed, 24 Apr 2024 16:14:15 +0100
Subject: [PATCH 36/44] Address review comments about lower_bound.h

---
 libcxx/include/__algorithm/lower_bound.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index f92befc97e1c06..c5d549a0d54c61 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -57,18 +57,14 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lo
 // would yield \Omega(n*log(n)) comparisons and, for non-random iterators, \Omega(n^2) iterator increments, whereas the
 // one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of
 // comparisons.
-template <class _AlgPolicy, class _Iter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
-__lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
-  // static_assert(std::is_base_of<std::forward_iterator_tag, typename _IterOps<_AlgPolicy>::template
-  // __iterator_category<_Iter>>::value,
-  //       "lower_bound() is a multipass algorithm and requires forward iterator or better");
-
+template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
   // step = 0, ensuring we can always short-circuit when distance is 1 later on
   if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value))
     return __first;
 
-  using _Distance = typename iterator_traits<_Iter>::difference_type;
+  using _Distance = typename iterator_traits<_ForwardIterator>::difference_type;
   for (_Distance __step = 1; __first != __last; __step <<= 1) {
     auto __it   = __first;
     auto __dist = __step - _IterOps<_AlgPolicy>::__advance_to(__it, __step, __last);
@@ -87,9 +83,9 @@ __lower_bound_onesided(_Iter __first, _Sent __last, const _Type& __value, _Comp&
   return __first;
 }
 
-template <class _AlgPolicy, class _RandIter, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
-__lower_bound(_RandIter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+template <class _AlgPolicy, class _RandomAccessIterator, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
+__lower_bound(_RandomAccessIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
   const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
   return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
 }

>From 3c9f8002b1ac954c0830005fd2cad3cad30f649d Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Wed, 24 Apr 2024 19:30:58 +0100
Subject: [PATCH 37/44] Address review comments about set_intersection.h:
 unnecessary namespace qualification, insufficient comments, and direct use of
 iterator traits.

---
 libcxx/include/__algorithm/set_intersection.h | 22 +++++--------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index a5a86baa345cb7..cbacb2c39605f7 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -134,7 +134,8 @@ struct _LIBCPP_NODISCARD __set_intersector {
   }
 };
 
-// with forward iterators we can use binary search to skip over entries
+// with forward iterators we can make multiple passes over the data, allowing the use of one-sided binary search to reduce best-case
+// complexity to log(N)
 template <class _AlgPolicy,
           class _Compare,
           class _InForwardIter1,
@@ -190,25 +191,12 @@ __set_intersection(
     }
   }
 
-  return std::__set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>(
+  return __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter>(
       _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)),
       _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)),
       std::move(__result));
 }
 
-template <class _AlgPolicy, class _Iter>
-class __set_intersection_iter_category {
-  template <class _It>
-  using __cat = typename std::_IterOps<_AlgPolicy>::template __iterator_category<_It>;
-  template <class _It>
-  static __cat<_It> test(__cat<_It>*);
-  template <class>
-  static std::input_iterator_tag test(...);
-
-public:
-  using __type = decltype(test<_Iter>(nullptr));
-};
-
 template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
 _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
@@ -221,8 +209,8 @@ __set_intersection(
       std::move(__last2),
       std::move(__result),
       __comp,
-      typename std::__set_intersection_iter_category<_AlgPolicy, _InIter1>::__type(),
-      typename std::__set_intersection_iter_category<_AlgPolicy, _InIter2>::__type());
+      typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter1>(),
+      typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter2>());
 }
 
 template <class _InputIterator1, class _InputIterator2, class _OutputIterator, class _Compare>

>From 4aa4a823367cfad153836d0ebdae6f80c15d02dd Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Wed, 24 Apr 2024 20:42:53 +0100
Subject: [PATCH 38/44] Address review comment about replacing `struct
 __set_intersector` with a function. I think I managed to preserve readability
 by keeping `__add_output_unless()` as a lambda.

---
 libcxx/include/__algorithm/set_intersection.h | 140 ++++++------------
 1 file changed, 44 insertions(+), 96 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index cbacb2c39605f7..d0aa551037b1ee 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -43,99 +43,18 @@ struct __set_intersection_result {
       : __in1_(std::move(__in_iter1)), __in2_(std::move(__in_iter2)), __out_(std::move(__out_iter)) {}
 };
 
-template <class _AlgPolicy, class _Compare, class _InIter1, class _Sent1, class _InIter2, class _Sent2, class _OutIter>
-struct _LIBCPP_NODISCARD __set_intersector {
-  _InIter1& __first1_;
-  const _Sent1& __last1_;
-  _InIter2& __first2_;
-  const _Sent2& __last2_;
-  _OutIter& __result_;
-  _Compare& __comp_;
-  bool __prev_advanced_ = true;
-
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersector(
-      _InIter1& __first1, _Sent1& __last1, _InIter2& __first2, _Sent2& __last2, _OutIter& __result, _Compare& __comp)
-      : __first1_(__first1),
-        __last1_(__last1),
-        __first2_(__first2),
-        __last2_(__last2),
-        __result_(__result),
-        __comp_(__comp) {}
-
-  _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI
-  _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter>
-  operator()() {
-    while (__first2_ != __last2_) {
-      __advance1_and_maybe_add_result();
-      if (__first1_ == __last1_)
-        break;
-      __advance2_and_maybe_add_result();
-    }
-    return __set_intersection_result<_InIter1, _InIter2, _OutIter>(
-        _IterOps<_AlgPolicy>::next(std::move(__first1_), std::move(__last1_)),
-        _IterOps<_AlgPolicy>::next(std::move(__first2_), std::move(__last2_)),
-        std::move(__result_));
-  }
-
-private:
-  // advance __iter to the first element in the range where !__comp_(__iter, __value)
-  // add result if this is the second consecutive call without advancing
-  // this method only works if you alternate calls between __advance1_and_maybe_add_result() and
-  // __advance2_and_maybe_add_result()
-  template <class _Iter, class _Sent, class _Value>
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
-  __advance_and_maybe_add_result(_Iter& __iter, const _Sent& __sentinel, const _Value& __value) {
-    _LIBCPP_CONSTEXPR std::__identity __proj;
-    // use one-sided binary search for improved algorithmic complexity bounds
-    // understanding how we can use binary search and still respect complexity
-    // guarantees is _not_ straightforward, so let me explain: the guarantee
-    // is "at most 2*(N+M)-1 comparisons", and one-sided binary search will
-    // necessarily overshoot depending on the position of the needle in the
-    // haystack -- for instance, if we're searching for 3 in (1, 2, 3, 4),
-    // we'll check if 3<1, then 3<2, then 3<4, and, finally, 3<3, for a total of
-    // 4 comparisons, when linear search would have yielded 3. However,
-    // because we won't need to perform the intervening reciprocal comparisons
-    // (ie 1<3, 2<3, 4<3), that extra comparison doesn't run afoul of the
-    // guarantee. Additionally, this type of scenario can only happen for match
-    // distances of up to 5 elements, because 2*log2(8) is 6, and we'll still
-    // be worse-off at position 5 of an 8-element set. From then onwards
-    // these scenarios can't happen.
-    // TL;DR: we'll be 1 comparison worse-off compared to the classic linear-
-    // searching algorithm if matching position 3 of a set with 4 elements,
-    // or position 5 if the set has 7 or 8 elements, but we'll never exceed
-    // the complexity guarantees from the standard.
-    _Iter __tmp = std::__lower_bound_onesided<_AlgPolicy>(__iter, __sentinel, __value, __comp_, __proj);
-    std::swap(__tmp, __iter);
-    __add_output_unless(__tmp != __iter);
-  }
-
-  // advance __first1_ to the first element in the range where !__comp_(*__first1_, *__first2_)
-  // add result if neither __first1_ nor __first2_ advanced in the last attempt (meaning they are equal)
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __advance1_and_maybe_add_result() {
-    __advance_and_maybe_add_result(__first1_, __last1_, *__first2_);
-  }
-
-  // advance __first2_ to the first element in the range where !__comp_(*__first2_, *__first1_)
-  // add result if neither __first1_ nor __first2_ advanced in the last attempt (meaning they are equal)
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __advance2_and_maybe_add_result() {
-    __advance_and_maybe_add_result(__first2_, __last2_, *__first1_);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __add_output_unless(bool __advanced) {
-    if (__advanced | __prev_advanced_) {
-      __prev_advanced_ = __advanced;
-    } else {
-      *__result_ = *__first1_;
-      ++__result_;
-      ++__first1_;
-      ++__first2_;
-      __prev_advanced_ = true;
-    }
-  }
-};
-
-// with forward iterators we can make multiple passes over the data, allowing the use of one-sided binary search to reduce best-case
-// complexity to log(N)
+// With forward iterators we can make multiple passes over the data, allowing the use of one-sided binary search to
+// reduce best-case complexity to log(N). Understanding how we can use binary search and still respect complexity
+// guarantees is _not_ straightforward: the guarantee is "at most 2*(N+M)-1 comparisons", and one-sided binary search
+// will necessarily overshoot depending on the position of the needle in the haystack -- for instance, if we're
+// searching for 3 in (1, 2, 3, 4), we'll check if 3<1, then 3<2, then 3<4, and, finally, 3<3, for a total of 4
+// comparisons, when linear search would have yielded 3. However, because we won't need to perform the intervening
+// reciprocal comparisons (ie 1<3, 2<3, 4<3), that extra comparison doesn't run afoul of the guarantee. Additionally,
+// this type of scenario can only happen for match distances of up to 5 elements, because 2*log2(8) is 6, and we'll
+// still be worse-off at position 5 of an 8-element set. From then onwards these scenarios can't happen. TL;DR: we'll be
+// 1 comparison worse-off compared to the classic linear-searching algorithm if matching position 3 of a set with 4
+// elements, or position 5 if the set has 7 or 8 elements, but we'll never exceed the complexity guarantees from the
+// standard.
 template <class _AlgPolicy,
           class _Compare,
           class _InForwardIter1,
@@ -154,9 +73,38 @@ __set_intersection(
     _Compare&& __comp,
     std::forward_iterator_tag,
     std::forward_iterator_tag) {
-  std::__set_intersector<_AlgPolicy, _Compare, _InForwardIter1, _Sent1, _InForwardIter2, _Sent2, _OutIter>
-      __intersector(__first1, __last1, __first2, __last2, __result, __comp);
-  return __intersector();
+  _LIBCPP_CONSTEXPR std::__identity __proj;
+  bool __prev_advanced = true;
+
+  auto __add_output_unless = [&](bool __advanced) {
+    if (__advanced | __prev_advanced) {
+      __prev_advanced = __advanced;
+    } else {
+      *__result = *__first1;
+      ++__result;
+      ++__first1;
+      ++__first2;
+      __prev_advanced = true;
+    }
+  };
+
+  while (__first2 != __last2) {
+    _InForwardIter1 __first1_next =
+        std::__lower_bound_onesided<_AlgPolicy>(__first1, __last1, *__first2, __comp, __proj);
+    std::swap(__first1_next, __first1);
+    __add_output_unless(__first1 != __first1_next);
+    if (__first1 == __last1)
+      break;
+
+    _InForwardIter2 __first2_next =
+        std::__lower_bound_onesided<_AlgPolicy>(__first2, __last2, *__first1, __comp, __proj);
+    std::swap(__first2_next, __first2);
+    __add_output_unless(__first2 != __first2_next);
+  }
+  return __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>(
+      _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)),
+      _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)),
+      std::move(__result));
 }
 
 // input iterators are not suitable for multipass algorithms, so we stick to the classic single-pass version

>From 8307b2db9c238b7d3c2d8648e827860d5be4a899 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Fri, 26 Apr 2024 16:04:55 +0100
Subject: [PATCH 39/44] Make `__add_output_unless()` a freestanding function,
 `__set_intersection_add_output_unless()`, because the lambda [tripped the
 "MacOS with C++03" test
 run](https://buildkite.com/llvm-project/libcxx-ci/builds/35055#018f173c-f155-4fbb-b6d7-a7aba01cec9e).

---
 libcxx/include/__algorithm/set_intersection.h | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index d0aa551037b1ee..293f500f541ac6 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -43,6 +43,24 @@ struct __set_intersection_result {
       : __in1_(std::move(__in_iter1)), __in2_(std::move(__in_iter2)), __out_(std::move(__out_iter)) {}
 };
 
+// Helper for __set_intersection() with one-sided binary search: populate result and advance input iterators if they
+// haven't advanced in the last 2 calls. This function is very intimately related to the way it is used and doesn't
+// attempt to abstract that, it's not appropriate for general usage outside of its context. It would be a lambda of
+// __set_intersection() if that hadn't stumped the compiler in C++03 mode on some platforms.
+template <class _InForwardIter1, class _InForwardIter2, class _OutIter>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_intersection_add_output_unless(
+    bool __advanced, _InForwardIter1& __first1, _InForwardIter2& __first2, _OutIter& __result, bool& __prev_advanced) {
+  if (__advanced | __prev_advanced) {
+    __prev_advanced = __advanced;
+  } else {
+    *__result = *__first1;
+    ++__result;
+    ++__first1;
+    ++__first2;
+    __prev_advanced = true;
+  }
+}
+
 // With forward iterators we can make multiple passes over the data, allowing the use of one-sided binary search to
 // reduce best-case complexity to log(N). Understanding how we can use binary search and still respect complexity
 // guarantees is _not_ straightforward: the guarantee is "at most 2*(N+M)-1 comparisons", and one-sided binary search
@@ -76,30 +94,18 @@ __set_intersection(
   _LIBCPP_CONSTEXPR std::__identity __proj;
   bool __prev_advanced = true;
 
-  auto __add_output_unless = [&](bool __advanced) {
-    if (__advanced | __prev_advanced) {
-      __prev_advanced = __advanced;
-    } else {
-      *__result = *__first1;
-      ++__result;
-      ++__first1;
-      ++__first2;
-      __prev_advanced = true;
-    }
-  };
-
   while (__first2 != __last2) {
     _InForwardIter1 __first1_next =
         std::__lower_bound_onesided<_AlgPolicy>(__first1, __last1, *__first2, __comp, __proj);
     std::swap(__first1_next, __first1);
-    __add_output_unless(__first1 != __first1_next);
+    std::__set_intersection_add_output_unless(__first1 != __first1_next, __first1, __first2, __result, __prev_advanced);
     if (__first1 == __last1)
       break;
 
     _InForwardIter2 __first2_next =
         std::__lower_bound_onesided<_AlgPolicy>(__first2, __last2, *__first1, __comp, __proj);
     std::swap(__first2_next, __first2);
-    __add_output_unless(__first2 != __first2_next);
+    std::__set_intersection_add_output_unless(__first2 != __first2_next, __first1, __first2, __result, __prev_advanced);
   }
   return __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>(
       _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)),

>From be6c5c848d5615adbe44546d9f78df680bd54767 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Sat, 27 Apr 2024 11:22:37 +0100
Subject: [PATCH 40/44] Address comment about using `std::forward<_Compare>()`
 for consistency in `__set_intersection()` base overload.

---
 libcxx/include/__algorithm/set_intersection.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 293f500f541ac6..822f6ebcab60f5 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -162,7 +162,7 @@ __set_intersection(
       std::move(__first2),
       std::move(__last2),
       std::move(__result),
-      __comp,
+       std::forward<_Compare>(__comp),
       typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter1>(),
       typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter2>());
 }

>From 62a6010a27dd30b8e0469c4be89d3d597421a995 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Sat, 27 Apr 2024 20:03:22 +0100
Subject: [PATCH 41/44] Address review feedback: remove benchmark counters.

---
 .../algorithms/set_intersection.bench.cpp     | 26 ++-----------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 2233b85f1162f0..a9752d9a5a15c8 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -135,10 +135,6 @@ std::pair<Container, Container> genCacheUnfriendlyData(size_t size1, size_t size
   return std::pair<Container, Container>();
 }
 
-// use environment variable to enable additional counters: instrumentation will
-// impact CPU utilisation, let's give the user the option
-static const bool TRACK_COUNTERS = getenv("TRACK_COUNTERS") != nullptr;
-
 template <class ValueType, class Container, class Overlap>
 struct SetIntersection {
   using ContainerType = typename Container::template type<Value<ValueType>>;
@@ -164,26 +160,8 @@ struct SetIntersection {
       while (state.KeepRunningBatch(BATCH_SIZE)) {
         for (unsigned i = 0; i < BATCH_SIZE; ++i) {
           const auto& [c1, c2] = input;
-          if (TRACK_COUNTERS) {
-            size_t cmp{}, strides{}, displacement{};
-            auto tracking_less = [&cmp](const Value<ValueType>& lhs, const Value<ValueType>& rhs) {
-              ++cmp;
-              return std::less<Value<ValueType>>{}(lhs, rhs);
-            };
-            stride_counting_iterator b1(c1.begin(), &strides, &displacement);
-            stride_counting_iterator e1(c1.end(), &strides, &displacement);
-            stride_counting_iterator b2(c2.begin(), &strides, &displacement);
-            stride_counting_iterator e2(c2.end(), &strides, &displacement);
-            auto res = std::set_intersection(b1, e1, b2, e2, out.begin(), tracking_less);
-            benchmark::DoNotOptimize(res);
-            state.counters["comparisons"]       = cmp;
-            state.counters["iter_strides"]      = strides;
-            state.counters["iter_displacement"] = displacement;
-
-          } else {
-            auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
-            benchmark::DoNotOptimize(res);
-          }
+          auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
+          benchmark::DoNotOptimize(res);
         }
       }
     }

>From e2af5cc3558f9c94b835826fe2927ac88600dce0 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Sat, 27 Apr 2024 20:04:15 +0100
Subject: [PATCH 42/44] clang-format of the last 2 changes

---
 libcxx/benchmarks/algorithms/set_intersection.bench.cpp | 2 +-
 libcxx/include/__algorithm/set_intersection.h           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index a9752d9a5a15c8..9cdf6a78594428 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -160,7 +160,7 @@ struct SetIntersection {
       while (state.KeepRunningBatch(BATCH_SIZE)) {
         for (unsigned i = 0; i < BATCH_SIZE; ++i) {
           const auto& [c1, c2] = input;
-          auto res = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
+          auto res             = std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin());
           benchmark::DoNotOptimize(res);
         }
       }
diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h
index 822f6ebcab60f5..9c2145731473c3 100644
--- a/libcxx/include/__algorithm/set_intersection.h
+++ b/libcxx/include/__algorithm/set_intersection.h
@@ -162,7 +162,7 @@ __set_intersection(
       std::move(__first2),
       std::move(__last2),
       std::move(__result),
-       std::forward<_Compare>(__comp),
+      std::forward<_Compare>(__comp),
       typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter1>(),
       typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter2>());
 }

>From 89201ea2ee881bf71574e1bc043bfbbc9afd355a Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Sun, 28 Apr 2024 15:20:47 +0100
Subject: [PATCH 43/44] Oops, leftover template type name!

---
 libcxx/include/__algorithm/lower_bound.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h
index c5d549a0d54c61..06b58bd6dd2d8c 100644
--- a/libcxx/include/__algorithm/lower_bound.h
+++ b/libcxx/include/__algorithm/lower_bound.h
@@ -83,9 +83,9 @@ __lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __va
   return __first;
 }
 
-template <class _AlgPolicy, class _RandomAccessIterator, class _Sent, class _Type, class _Proj, class _Comp>
-_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandomAccessIterator
-__lower_bound(_RandomAccessIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
+template <class _AlgPolicy, class _ForwardIterator, class _Sent, class _Type, class _Proj, class _Comp>
+_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+__lower_bound(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) {
   const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last);
   return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj);
 }

>From 5f6e7feaa267af9482cc14dbb8b09fc8497ae326 Mon Sep 17 00:00:00 2001
From: Iuri Chaer <ichaer at splunk.com>
Date: Mon, 29 Apr 2024 20:38:31 +0100
Subject: [PATCH 44/44] Remove unnecessary PauseTiming()/ResumeTiming() in the
 benchmark data generation stage: time won't be measured before we go into the
 benchmark::State loops.

---
 libcxx/benchmarks/algorithms/set_intersection.bench.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
index 9cdf6a78594428..ba7670e7825374 100644
--- a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
+++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp
@@ -149,13 +149,10 @@ struct SetIntersection {
   }
 
   void run(benchmark::State& state) const {
-    state.PauseTiming();
     auto input = genCacheUnfriendlyData<ContainerType>(size1_, size2_, Overlap());
     std::vector<Value<ValueType>> out(std::min(size1_, size2_));
 
     const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_));
-    state.ResumeTiming();
-
     for (const auto& _ : state) {
       while (state.KeepRunningBatch(BATCH_SIZE)) {
         for (unsigned i = 0; i < BATCH_SIZE; ++i) {



More information about the libcxx-commits mailing list