[libcxx-commits] [libcxx] [libc++] Optimize ranges::swap_ranges for vector<bool>::iterator (PR #121150)

Peng Liu via libcxx-commits libcxx-commits at lists.llvm.org
Thu Dec 26 05:29:56 PST 2024


https://github.com/winner245 created https://github.com/llvm/llvm-project/pull/121150

This PR optimizes the performance of `std::ranges::swap_ranges` for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to **x** for aligned range swap and **60x** for unaligned range swap comparison. 

- Aligned 


- Unaligned

>From faed5b39d558aa26a30536c149629b7303fb4478 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245 at hotmail.com>
Date: Wed, 25 Dec 2024 16:06:35 -0500
Subject: [PATCH] Optimize ranges::swap_ranges for vector<bool>::iterator

---
 libcxx/include/__algorithm/swap_ranges.h      | 161 +++++++++++
 libcxx/include/__bit_reference                | 157 +----------
 .../algorithms/swap_ranges.bench.cpp          |  66 +++++
 .../alg.swap/iter_swap.pass.cpp               |  26 +-
 .../alg.swap/ranges.swap_ranges.pass.cpp      |  58 ++--
 .../alg.swap/swap_ranges.pass.cpp             | 253 ++++++++++--------
 6 files changed, 422 insertions(+), 299 deletions(-)
 create mode 100644 libcxx/test/benchmarks/algorithms/swap_ranges.bench.cpp

diff --git a/libcxx/include/__algorithm/swap_ranges.h b/libcxx/include/__algorithm/swap_ranges.h
index 54b453b72360e0..be151523a9aa01 100644
--- a/libcxx/include/__algorithm/swap_ranges.h
+++ b/libcxx/include/__algorithm/swap_ranges.h
@@ -10,7 +10,9 @@
 #define _LIBCPP___ALGORITHM_SWAP_RANGES_H
 
 #include <__algorithm/iterator_operations.h>
+#include <__algorithm/min.h>
 #include <__config>
+#include <__fwd/bit_reference.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 
@@ -23,6 +25,165 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_aligned(
+    __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
+  using _I1             = __bit_iterator<_Cl, false>;
+  using difference_type = typename _I1::difference_type;
+  using __storage_type  = typename _I1::__storage_type;
+
+  const int __bits_per_word = _I1::__bits_per_word;
+  difference_type __n       = __last - __first;
+  if (__n > 0) {
+    // do first word
+    if (__first.__ctz_ != 0) {
+      unsigned __clz       = __bits_per_word - __first.__ctz_;
+      difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
+      __n -= __dn;
+      __storage_type __m  = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __storage_type __b1 = *__first.__seg_ & __m;
+      *__first.__seg_ &= ~__m;
+      __storage_type __b2 = *__result.__seg_ & __m;
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b1;
+      *__first.__seg_ |= __b2;
+      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+      ++__first.__seg_;
+      // __first.__ctz_ = 0;
+    }
+    // __first.__ctz_ == 0;
+    // do middle words
+    for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_)
+      swap(*__first.__seg_, *__result.__seg_);
+    // do last word
+    if (__n > 0) {
+      __storage_type __m  = ~__storage_type(0) >> (__bits_per_word - __n);
+      __storage_type __b1 = *__first.__seg_ & __m;
+      *__first.__seg_ &= ~__m;
+      __storage_type __b2 = *__result.__seg_ & __m;
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b1;
+      *__first.__seg_ |= __b2;
+      __result.__ctz_ = static_cast<unsigned>(__n);
+    }
+  }
+  return __result;
+}
+
+template <class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cr, false> __swap_ranges_unaligned(
+    __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
+  using _I1             = __bit_iterator<_Cl, false>;
+  using difference_type = typename _I1::difference_type;
+  using __storage_type  = typename _I1::__storage_type;
+
+  const int __bits_per_word = _I1::__bits_per_word;
+  difference_type __n       = __last - __first;
+  if (__n > 0) {
+    // do first word
+    if (__first.__ctz_ != 0) {
+      unsigned __clz_f     = __bits_per_word - __first.__ctz_;
+      difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
+      __n -= __dn;
+      __storage_type __m  = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+      __storage_type __b1 = *__first.__seg_ & __m;
+      *__first.__seg_ &= ~__m;
+      unsigned __clz_r     = __bits_per_word - __result.__ctz_;
+      __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
+      __m                  = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
+      __storage_type __b2  = *__result.__seg_ & __m;
+      *__result.__seg_ &= ~__m;
+      if (__result.__ctz_ > __first.__ctz_) {
+        unsigned __s = __result.__ctz_ - __first.__ctz_;
+        *__result.__seg_ |= __b1 << __s;
+        *__first.__seg_ |= __b2 >> __s;
+      } else {
+        unsigned __s = __first.__ctz_ - __result.__ctz_;
+        *__result.__seg_ |= __b1 >> __s;
+        *__first.__seg_ |= __b2 << __s;
+      }
+      __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
+      __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
+      __dn -= __ddn;
+      if (__dn > 0) {
+        __m  = ~__storage_type(0) >> (__bits_per_word - __dn);
+        __b2 = *__result.__seg_ & __m;
+        *__result.__seg_ &= ~__m;
+        unsigned __s = __first.__ctz_ + __ddn;
+        *__result.__seg_ |= __b1 >> __s;
+        *__first.__seg_ |= __b2 << __s;
+        __result.__ctz_ = static_cast<unsigned>(__dn);
+      }
+      ++__first.__seg_;
+      // __first.__ctz_ = 0;
+    }
+    // __first.__ctz_ == 0;
+    // do middle words
+    __storage_type __m = ~__storage_type(0) << __result.__ctz_;
+    unsigned __clz_r   = __bits_per_word - __result.__ctz_;
+    for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
+      __storage_type __b1 = *__first.__seg_;
+      __storage_type __b2 = *__result.__seg_ & __m;
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b1 << __result.__ctz_;
+      *__first.__seg_ = __b2 >> __result.__ctz_;
+      ++__result.__seg_;
+      __b2 = *__result.__seg_ & ~__m;
+      *__result.__seg_ &= __m;
+      *__result.__seg_ |= __b1 >> __clz_r;
+      *__first.__seg_ |= __b2 << __clz_r;
+    }
+    // do last word
+    if (__n > 0) {
+      __m                 = ~__storage_type(0) >> (__bits_per_word - __n);
+      __storage_type __b1 = *__first.__seg_ & __m;
+      *__first.__seg_ &= ~__m;
+      __storage_type __dn = std::min<__storage_type>(__n, __clz_r);
+      __m                 = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
+      __storage_type __b2 = *__result.__seg_ & __m;
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b1 << __result.__ctz_;
+      *__first.__seg_ |= __b2 >> __result.__ctz_;
+      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+      __n -= __dn;
+      if (__n > 0) {
+        __m  = ~__storage_type(0) >> (__bits_per_word - __n);
+        __b2 = *__result.__seg_ & __m;
+        *__result.__seg_ &= ~__m;
+        *__result.__seg_ |= __b1 >> __dn;
+        *__first.__seg_ |= __b2 << __dn;
+        __result.__ctz_ = static_cast<unsigned>(__n);
+      }
+    }
+  }
+  return __result;
+}
+
+// 2+1 iterators: size2 >= size1; used by std::swap_ranges.
+template <class, class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
+__swap_ranges(__bit_iterator<_Cl, false> __first1,
+              __bit_iterator<_Cl, false> __last1,
+              __bit_iterator<_Cr, false> __first2) {
+  if (__first1.__ctz_ == __first2.__ctz_)
+    return std::make_pair(__last1, std::__swap_ranges_aligned(__first1, __last1, __first2));
+  return std::make_pair(__last1, std::__swap_ranges_unaligned(__first1, __last1, __first2));
+}
+
+// 2+2 iterators: used by std::ranges::swap_ranges.
+template <class _AlgPolicy, class _Cl, class _Cr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
+__swap_ranges(__bit_iterator<_Cl, false> __first1,
+              __bit_iterator<_Cl, false> __last1,
+              __bit_iterator<_Cr, false> __first2,
+              __bit_iterator<_Cr, false> __last2) {
+  if (__last1 - __first1 < __last2 - __first2)
+    return std::make_pair(__last1, std::__swap_ranges<_AlgPolicy>(__first1, __last1, __first2).second);
+  return std::make_pair(std::__swap_ranges<_AlgPolicy>(__first2, __last2, __first1).second, __last2);
+}
+
 // 2+2 iterators: the shorter size will be used.
 template <class _AlgPolicy, class _ForwardIterator1, class _Sentinel1, class _ForwardIterator2, class _Sentinel2>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_ForwardIterator1, _ForwardIterator2>
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 9fa24c98d493fd..0b8c31f643fadd 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -12,6 +12,7 @@
 
 #include <__algorithm/copy_n.h>
 #include <__algorithm/min.h>
+#include <__algorithm/swap_ranges.h>
 #include <__bit/countr.h>
 #include <__compare/ordering.h>
 #include <__config>
@@ -437,152 +438,6 @@ inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> move_backward(
   return std::copy_backward(__first, __last, __result);
 }
 
-// swap_ranges
-
-template <class _Cl, class _Cr>
-_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> __swap_ranges_aligned(
-    __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
-  using _I1             = __bit_iterator<_Cl, false>;
-  using difference_type = typename _I1::difference_type;
-  using __storage_type  = typename _I1::__storage_type;
-
-  const int __bits_per_word = _I1::__bits_per_word;
-  difference_type __n       = __last - __first;
-  if (__n > 0) {
-    // do first word
-    if (__first.__ctz_ != 0) {
-      unsigned __clz       = __bits_per_word - __first.__ctz_;
-      difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
-      __n -= __dn;
-      __storage_type __m  = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
-      __storage_type __b1 = *__first.__seg_ & __m;
-      *__first.__seg_ &= ~__m;
-      __storage_type __b2 = *__result.__seg_ & __m;
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b1;
-      *__first.__seg_ |= __b2;
-      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
-      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
-      ++__first.__seg_;
-      // __first.__ctz_ = 0;
-    }
-    // __first.__ctz_ == 0;
-    // do middle words
-    for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_)
-      swap(*__first.__seg_, *__result.__seg_);
-    // do last word
-    if (__n > 0) {
-      __storage_type __m  = ~__storage_type(0) >> (__bits_per_word - __n);
-      __storage_type __b1 = *__first.__seg_ & __m;
-      *__first.__seg_ &= ~__m;
-      __storage_type __b2 = *__result.__seg_ & __m;
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b1;
-      *__first.__seg_ |= __b2;
-      __result.__ctz_ = static_cast<unsigned>(__n);
-    }
-  }
-  return __result;
-}
-
-template <class _Cl, class _Cr>
-_LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> __swap_ranges_unaligned(
-    __bit_iterator<_Cl, false> __first, __bit_iterator<_Cl, false> __last, __bit_iterator<_Cr, false> __result) {
-  using _I1             = __bit_iterator<_Cl, false>;
-  using difference_type = typename _I1::difference_type;
-  using __storage_type  = typename _I1::__storage_type;
-
-  const int __bits_per_word = _I1::__bits_per_word;
-  difference_type __n       = __last - __first;
-  if (__n > 0) {
-    // do first word
-    if (__first.__ctz_ != 0) {
-      unsigned __clz_f     = __bits_per_word - __first.__ctz_;
-      difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
-      __n -= __dn;
-      __storage_type __m  = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
-      __storage_type __b1 = *__first.__seg_ & __m;
-      *__first.__seg_ &= ~__m;
-      unsigned __clz_r     = __bits_per_word - __result.__ctz_;
-      __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
-      __m                  = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
-      __storage_type __b2  = *__result.__seg_ & __m;
-      *__result.__seg_ &= ~__m;
-      if (__result.__ctz_ > __first.__ctz_) {
-        unsigned __s = __result.__ctz_ - __first.__ctz_;
-        *__result.__seg_ |= __b1 << __s;
-        *__first.__seg_ |= __b2 >> __s;
-      } else {
-        unsigned __s = __first.__ctz_ - __result.__ctz_;
-        *__result.__seg_ |= __b1 >> __s;
-        *__first.__seg_ |= __b2 << __s;
-      }
-      __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
-      __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
-      __dn -= __ddn;
-      if (__dn > 0) {
-        __m  = ~__storage_type(0) >> (__bits_per_word - __dn);
-        __b2 = *__result.__seg_ & __m;
-        *__result.__seg_ &= ~__m;
-        unsigned __s = __first.__ctz_ + __ddn;
-        *__result.__seg_ |= __b1 >> __s;
-        *__first.__seg_ |= __b2 << __s;
-        __result.__ctz_ = static_cast<unsigned>(__dn);
-      }
-      ++__first.__seg_;
-      // __first.__ctz_ = 0;
-    }
-    // __first.__ctz_ == 0;
-    // do middle words
-    __storage_type __m = ~__storage_type(0) << __result.__ctz_;
-    unsigned __clz_r   = __bits_per_word - __result.__ctz_;
-    for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
-      __storage_type __b1 = *__first.__seg_;
-      __storage_type __b2 = *__result.__seg_ & __m;
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b1 << __result.__ctz_;
-      *__first.__seg_ = __b2 >> __result.__ctz_;
-      ++__result.__seg_;
-      __b2 = *__result.__seg_ & ~__m;
-      *__result.__seg_ &= __m;
-      *__result.__seg_ |= __b1 >> __clz_r;
-      *__first.__seg_ |= __b2 << __clz_r;
-    }
-    // do last word
-    if (__n > 0) {
-      __m                 = ~__storage_type(0) >> (__bits_per_word - __n);
-      __storage_type __b1 = *__first.__seg_ & __m;
-      *__first.__seg_ &= ~__m;
-      __storage_type __dn = std::min<__storage_type>(__n, __clz_r);
-      __m                 = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
-      __storage_type __b2 = *__result.__seg_ & __m;
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b1 << __result.__ctz_;
-      *__first.__seg_ |= __b2 >> __result.__ctz_;
-      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
-      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
-      __n -= __dn;
-      if (__n > 0) {
-        __m  = ~__storage_type(0) >> (__bits_per_word - __n);
-        __b2 = *__result.__seg_ & __m;
-        *__result.__seg_ &= ~__m;
-        *__result.__seg_ |= __b1 >> __dn;
-        *__first.__seg_ |= __b2 << __dn;
-        __result.__ctz_ = static_cast<unsigned>(__n);
-      }
-    }
-  }
-  return __result;
-}
-
-template <class _Cl, class _Cr>
-inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cr, false> swap_ranges(
-    __bit_iterator<_Cl, false> __first1, __bit_iterator<_Cl, false> __last1, __bit_iterator<_Cr, false> __first2) {
-  if (__first1.__ctz_ == __first2.__ctz_)
-    return std::__swap_ranges_aligned(__first1, __last1, __first2);
-  return std::__swap_ranges_unaligned(__first1, __last1, __first2);
-}
-
 // rotate
 
 template <class _Cp>
@@ -987,14 +842,14 @@ private:
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
   copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
   template <class _Cl, class _Cr>
-  friend __bit_iterator<_Cr, false>
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Cr, false>
       __swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
   template <class _Cl, class _Cr>
-  friend __bit_iterator<_Cr, false>
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Cr, false>
       __swap_ranges_unaligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
-  template <class _Cl, class _Cr>
-  friend __bit_iterator<_Cr, false>
-      swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
+  template <class, class _Cl, class _Cr>
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend pair<__bit_iterator<_Cl, false>, __bit_iterator<_Cr, false> >
+      __swap_ranges(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);
   template <class _Dp>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
       rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>);
diff --git a/libcxx/test/benchmarks/algorithms/swap_ranges.bench.cpp b/libcxx/test/benchmarks/algorithms/swap_ranges.bench.cpp
new file mode 100644
index 00000000000000..a8fcdf8b9c4883
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/swap_ranges.bench.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <vector>
+
+static void bm_ranges_swap_ranges_aligned(benchmark::State& state) {
+  auto n = state.range();
+  std::vector<bool> vec1(n, true);
+  std::vector<bool> vec2(n, false);
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::ranges::swap_ranges(vec1, vec2));
+    benchmark::DoNotOptimize(&vec1);
+    benchmark::DoNotOptimize(&vec2);
+  }
+}
+
+static void bm_ranges_swap_ranges_unaligned(benchmark::State& state) {
+  auto n = state.range();
+  std::vector<bool> vec1(n, true);
+  std::vector<bool> vec2(n + 8, true);
+  auto beg1 = std::ranges::begin(vec1);
+  auto end1 = std::ranges::end(vec1);
+  auto beg2 = std::ranges::begin(vec2) + 4;
+  auto end2 = std::ranges::end(vec2) - 4;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::ranges::swap_ranges(beg1, end1, beg2, end2));
+    benchmark::DoNotOptimize(&vec1);
+    benchmark::DoNotOptimize(&vec2);
+  }
+}
+
+// Test std::ranges::swap_ranges for vector<bool>::iterator
+BENCHMARK(bm_ranges_swap_ranges_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_ranges_swap_ranges_unaligned)->Range(8, 1 << 20);
+
+static void bm_swap_ranges(benchmark::State& state, bool aligned) {
+  auto n = state.range();
+  std::vector<bool> vec1(n, true);
+  std::vector<bool> vec2(aligned ? n : n + 8, true);
+  auto beg1 = vec1.begin();
+  auto end1 = vec1.end();
+  auto beg2 = aligned ? vec2.begin() : vec2.begin() + 4;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::swap_ranges(beg1, end1, beg2));
+    benchmark::DoNotOptimize(&vec1);
+    benchmark::DoNotOptimize(&vec2);
+  }
+}
+
+static void bm_swap_ranges_aligned(benchmark::State& state) { bm_swap_ranges(state, true); }
+static void bm_swap_ranges_unaligned(benchmark::State& state) { bm_swap_ranges(state, false); }
+
+// Test std::swap_ranges for vector<bool>::iterator
+BENCHMARK(bm_swap_ranges_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_swap_ranges_unaligned)->Range(8, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/iter_swap.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/iter_swap.pass.cpp
index b3a9f5fc259ef7..0394a48a0bb9a2 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/iter_swap.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/iter_swap.pass.cpp
@@ -19,25 +19,23 @@
 #include "test_macros.h"
 
 #if TEST_STD_VER > 17
-constexpr bool test_swap_constexpr()
-{
-    int i = 1;
-    int j = 2;
-    std::iter_swap(&i, &j);
-    return i == 2 && j == 1;
+constexpr bool test_swap_constexpr() {
+  int i = 1;
+  int j = 2;
+  std::iter_swap(&i, &j);
+  return i == 2 && j == 1;
 }
 #endif // TEST_STD_VER > 17
 
-int main(int, char**)
-{
-    int i = 1;
-    int j = 2;
-    std::iter_swap(&i, &j);
-    assert(i == 2);
-    assert(j == 1);
+int main(int, char**) {
+  int i = 1;
+  int j = 2;
+  std::iter_swap(&i, &j);
+  assert(i == 2);
+  assert(j == 1);
 
 #if TEST_STD_VER > 17
-    static_assert(test_swap_constexpr());
+  static_assert(test_swap_constexpr());
 #endif // TEST_STD_VER > 17
 
   return 0;
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp
index a8d69b2832b462..a7b62dc4b886d3 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/ranges.swap_ranges.pass.cpp
@@ -23,13 +23,14 @@
 #include <array>
 #include <cassert>
 #include <ranges>
+#include <vector>
 
 #include "test_iterators.h"
 
 constexpr void test_different_lengths() {
-  using Expected = std::ranges::swap_ranges_result<int*, int*>;
-  int i[3] = {1, 2, 3};
-  int j[1] = {4};
+  using Expected                = std::ranges::swap_ranges_result<int*, int*>;
+  int i[3]                      = {1, 2, 3};
+  int j[1]                      = {4};
   std::same_as<Expected> auto r = std::ranges::swap_ranges(i, i + 3, j, j + 1);
   assert(r.in1 == i + 1);
   assert(r.in2 == j + 1);
@@ -64,8 +65,8 @@ constexpr void test_range() {
   std::array r1 = {1, 2, 3};
   std::array r2 = {4, 5, 6};
 
-
-  std::same_as<std::ranges::in_in_result<std::array<int, 3>::iterator, std::array<int, 3>::iterator>> auto r = std::ranges::swap_ranges(r1, r2);
+  std::same_as<std::ranges::in_in_result<std::array<int, 3>::iterator, std::array<int, 3>::iterator>> auto r =
+      std::ranges::swap_ranges(r1, r2);
   assert(r.in1 == r1.end());
   assert(r.in2 == r2.end());
 
@@ -110,13 +111,12 @@ constexpr void test_borrowed_input_range() {
 }
 
 constexpr void test_sentinel() {
-  int i[3] = {1, 2, 3};
-  int j[3] = {4, 5, 6};
-  using It = cpp17_input_iterator<int*>;
-  using Sent = sentinel_wrapper<It>;
-  using Expected = std::ranges::swap_ranges_result<It, It>;
-  std::same_as<Expected> auto r =
-      std::ranges::swap_ranges(It(i), Sent(It(i + 3)), It(j), Sent(It(j + 3)));
+  int i[3]                      = {1, 2, 3};
+  int j[3]                      = {4, 5, 6};
+  using It                      = cpp17_input_iterator<int*>;
+  using Sent                    = sentinel_wrapper<It>;
+  using Expected                = std::ranges::swap_ranges_result<It, It>;
+  std::same_as<Expected> auto r = std::ranges::swap_ranges(It(i), Sent(It(i + 3)), It(j), Sent(It(j + 3)));
   assert(base(r.in1) == i + 3);
   assert(base(r.in2) == j + 3);
   assert(i[0] == 4);
@@ -130,8 +130,8 @@ constexpr void test_sentinel() {
 template <class Iter1, class Iter2>
 constexpr void test_iterators() {
   using Expected = std::ranges::swap_ranges_result<Iter1, Iter2>;
-  int i[3] = {1, 2, 3};
-  int j[3] = {4, 5, 6};
+  int i[3]       = {1, 2, 3};
+  int j[3]       = {4, 5, 6};
   std::same_as<Expected> auto r =
       std::ranges::swap_ranges(Iter1(i), sentinel_wrapper(Iter1(i + 3)), Iter2(j), sentinel_wrapper(Iter2(j + 3)));
   assert(base(r.in1) == i + 3);
@@ -146,7 +146,7 @@ constexpr void test_iterators() {
 
 constexpr void test_rval_range() {
   {
-    using Expected = std::ranges::swap_ranges_result<std::array<int, 3>::iterator, std::ranges::dangling>;
+    using Expected       = std::ranges::swap_ranges_result<std::array<int, 3>::iterator, std::ranges::dangling>;
     std::array<int, 3> r = {1, 2, 3};
     std::same_as<Expected> auto a = std::ranges::swap_ranges(r, std::array{4, 5, 6});
     assert((r == std::array{4, 5, 6}));
@@ -154,7 +154,7 @@ constexpr void test_rval_range() {
   }
   {
     std::array<int, 3> r = {1, 2, 3};
-    using Expected = std::ranges::swap_ranges_result<std::ranges::dangling, std::array<int, 3>::iterator>;
+    using Expected       = std::ranges::swap_ranges_result<std::ranges::dangling, std::array<int, 3>::iterator>;
     std::same_as<Expected> auto b = std::ranges::swap_ranges(std::array{4, 5, 6}, r);
     assert((r == std::array{4, 5, 6}));
     assert(b.in2 == r.begin() + 3);
@@ -170,6 +170,24 @@ constexpr void test_proxy_in_iterators() {
   test_iterators<ProxyIterator<contiguous_iterator<int*>>, Out>();
 }
 
+template <std::size_t N>
+struct TestBitIter {
+  TEST_CONSTEXPR_CXX20 void operator()() {
+    { // Test swap_ranges() with aligned bytes
+      std::vector<bool> f(N, false), t(N, true);
+      std::ranges::swap_ranges(f, t);
+      assert(std::all_of(f.begin(), f.end(), [](bool b) { return b; }));
+      assert(std::all_of(t.begin(), t.end(), [](bool b) { return !b; }));
+    }
+    { // Test swap_ranges() with unaligned bytes
+      std::vector<bool> f(N, false), t(N + 8, true);
+      std::ranges::swap_ranges(f.begin(), f.end(), t.begin() + 4, t.end() - 4);
+      assert(std::all_of(f.begin(), f.end(), [](bool b) { return b; }));
+      assert(std::all_of(t.begin() + 4, t.end() - 4, [](bool b) { return !b; }));
+    }
+  }
+};
+
 constexpr bool test() {
   test_range();
 
@@ -214,6 +232,14 @@ constexpr bool test() {
   test_borrowed_input_range();
   test_rval_range();
 
+  { // Test vector<bool>::iterator optimization
+    TestBitIter<8>()();
+    TestBitIter<16>()();
+    TestBitIter<32>()();
+    TestBitIter<64>()();
+    TestBitIter<1024>()();
+  }
+
   return true;
 }
 
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp
index 8bfbcd755e39f7..74280053449b1b 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.swap/swap_ranges.pass.cpp
@@ -17,159 +17,176 @@
 #include <cassert>
 #include <memory>
 #include <utility>
+#include <vector>
 
 #include "test_macros.h"
 #include "test_iterators.h"
 
-template<class Iter1, class Iter2>
-void
-test()
-{
-    int i[3] = {1, 2, 3};
-    int j[3] = {4, 5, 6};
-    Iter2 r = std::swap_ranges(Iter1(i), Iter1(i+3), Iter2(j));
-    assert(base(r) == j+3);
-    assert(i[0] == 4);
-    assert(i[1] == 5);
-    assert(i[2] == 6);
-    assert(j[0] == 1);
-    assert(j[1] == 2);
-    assert(j[2] == 3);
+template <class Iter1, class Iter2>
+void test() {
+  int i[3] = {1, 2, 3};
+  int j[3] = {4, 5, 6};
+  Iter2 r  = std::swap_ranges(Iter1(i), Iter1(i + 3), Iter2(j));
+  assert(base(r) == j + 3);
+  assert(i[0] == 4);
+  assert(i[1] == 5);
+  assert(i[2] == 6);
+  assert(j[0] == 1);
+  assert(j[1] == 2);
+  assert(j[2] == 3);
 }
 
 #if TEST_STD_VER >= 11
-template<class Iter1, class Iter2>
-void
-test1()
-{
-    std::unique_ptr<int> i[3];
-    for (int k = 0; k < 3; ++k)
-        i[k].reset(new int(k+1));
-    std::unique_ptr<int> j[3];
-    for (int k = 0; k < 3; ++k)
-        j[k].reset(new int(k+4));
-    Iter2 r = std::swap_ranges(Iter1(i), Iter1(i+3), Iter2(j));
-    assert(base(r) == j+3);
-    assert(*i[0] == 4);
-    assert(*i[1] == 5);
-    assert(*i[2] == 6);
-    assert(*j[0] == 1);
-    assert(*j[1] == 2);
-    assert(*j[2] == 3);
+template <class Iter1, class Iter2>
+void test1() {
+  std::unique_ptr<int> i[3];
+  for (int k = 0; k < 3; ++k)
+    i[k].reset(new int(k + 1));
+  std::unique_ptr<int> j[3];
+  for (int k = 0; k < 3; ++k)
+    j[k].reset(new int(k + 4));
+  Iter2 r = std::swap_ranges(Iter1(i), Iter1(i + 3), Iter2(j));
+  assert(base(r) == j + 3);
+  assert(*i[0] == 4);
+  assert(*i[1] == 5);
+  assert(*i[2] == 6);
+  assert(*j[0] == 1);
+  assert(*j[1] == 2);
+  assert(*j[2] == 3);
 }
 #endif // TEST_STD_VER >= 11
 
-void test2()
-{
-    {
+void test2() {
+  {
     int src[2][2]      = {{0, 1}, {2, 3}};
     decltype(src) dest = {{9, 8}, {7, 6}};
 
     std::swap(src, dest);
 
-    assert ( src[0][0] == 9 );
-    assert ( src[0][1] == 8 );
-    assert ( src[1][0] == 7 );
-    assert ( src[1][1] == 6 );
+    assert(src[0][0] == 9);
+    assert(src[0][1] == 8);
+    assert(src[1][0] == 7);
+    assert(src[1][1] == 6);
 
-    assert ( dest[0][0] == 0 );
-    assert ( dest[0][1] == 1 );
-    assert ( dest[1][0] == 2 );
-    assert ( dest[1][1] == 3 );
-    }
+    assert(dest[0][0] == 0);
+    assert(dest[0][1] == 1);
+    assert(dest[1][0] == 2);
+    assert(dest[1][1] == 3);
+  }
 
-    {
+  {
     int src[3][3]      = {{0, 1, 2}, {3, 4, 5}, {6, 7, 8}};
     decltype(src) dest = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
 
     std::swap(src, dest);
 
-    assert ( src[0][0] == 9 );
-    assert ( src[0][1] == 8 );
-    assert ( src[0][2] == 7 );
-    assert ( src[1][0] == 6 );
-    assert ( src[1][1] == 5 );
-    assert ( src[1][2] == 4 );
-    assert ( src[2][0] == 3 );
-    assert ( src[2][1] == 2 );
-    assert ( src[2][2] == 1 );
-
-    assert ( dest[0][0] == 0 );
-    assert ( dest[0][1] == 1 );
-    assert ( dest[0][2] == 2 );
-    assert ( dest[1][0] == 3 );
-    assert ( dest[1][1] == 4 );
-    assert ( dest[1][2] == 5 );
-    assert ( dest[2][0] == 6 );
-    assert ( dest[2][1] == 7 );
-    assert ( dest[2][2] == 8 );
-    }
+    assert(src[0][0] == 9);
+    assert(src[0][1] == 8);
+    assert(src[0][2] == 7);
+    assert(src[1][0] == 6);
+    assert(src[1][1] == 5);
+    assert(src[1][2] == 4);
+    assert(src[2][0] == 3);
+    assert(src[2][1] == 2);
+    assert(src[2][2] == 1);
+
+    assert(dest[0][0] == 0);
+    assert(dest[0][1] == 1);
+    assert(dest[0][2] == 2);
+    assert(dest[1][0] == 3);
+    assert(dest[1][1] == 4);
+    assert(dest[1][2] == 5);
+    assert(dest[2][0] == 6);
+    assert(dest[2][1] == 7);
+    assert(dest[2][2] == 8);
+  }
 }
 
 #if TEST_STD_VER > 17
-constexpr bool test_swap_constexpr()
-{
-    int i[3] = {1, 2, 3};
-    int j[3] = {4, 5, 6};
-    std::swap_ranges(i, i+3, j);
-    return i[0] == 4 &&
-           i[1] == 5 &&
-           i[2] == 6 &&
-           j[0] == 1 &&
-           j[1] == 2 &&
-           j[2] == 3;
+constexpr bool test_swap_constexpr() {
+  int i[3] = {1, 2, 3};
+  int j[3] = {4, 5, 6};
+  std::swap_ranges(i, i + 3, j);
+  return i[0] == 4 && i[1] == 5 && i[2] == 6 && j[0] == 1 && j[1] == 2 && j[2] == 3;
 }
 #endif // TEST_STD_VER > 17
 
-int main(int, char**)
-{
-    test<forward_iterator<int*>, forward_iterator<int*> >();
-    test<forward_iterator<int*>, bidirectional_iterator<int*> >();
-    test<forward_iterator<int*>, random_access_iterator<int*> >();
-    test<forward_iterator<int*>, int*>();
-
-    test<bidirectional_iterator<int*>, forward_iterator<int*> >();
-    test<bidirectional_iterator<int*>, bidirectional_iterator<int*> >();
-    test<bidirectional_iterator<int*>, random_access_iterator<int*> >();
-    test<bidirectional_iterator<int*>, int*>();
-
-    test<random_access_iterator<int*>, forward_iterator<int*> >();
-    test<random_access_iterator<int*>, bidirectional_iterator<int*> >();
-    test<random_access_iterator<int*>, random_access_iterator<int*> >();
-    test<random_access_iterator<int*>, int*>();
-
-    test<int*, forward_iterator<int*> >();
-    test<int*, bidirectional_iterator<int*> >();
-    test<int*, random_access_iterator<int*> >();
-    test<int*, int*>();
+template <std::size_t N>
+struct TestBitIter {
+  std::vector<bool> f, t;
+  TEST_CONSTEXPR_CXX20 TestBitIter() : f(N, false), t(N, true) {}
+  TEST_CONSTEXPR_CXX20 void operator()() {
+    { // Test swap_ranges() with aligned bytes
+      std::vector<bool> f1 = f, t1 = t;
+      std::swap_ranges(f1.begin(), f1.end(), t1.begin());
+      assert(std::equal(f1.begin(), f1.end(), t.begin()));
+      assert(std::equal(t1.begin(), t1.end(), f.begin()));
+    }
+    { // Test swap_ranges() with unaligned bytes
+      std::vector<bool> f1(N, false), t1(N + 8, true);
+      std::swap_ranges(f1.begin(), f1.end(), t1.begin() + 4);
+      assert(std::equal(f1.begin(), f1.end(), t.begin()));
+      assert(std::equal(t1.begin() + 4, t1.end() - 4, f.begin()));
+    }
+  }
+};
+
+int main(int, char**) {
+  test<forward_iterator<int*>, forward_iterator<int*> >();
+  test<forward_iterator<int*>, bidirectional_iterator<int*> >();
+  test<forward_iterator<int*>, random_access_iterator<int*> >();
+  test<forward_iterator<int*>, int*>();
+
+  test<bidirectional_iterator<int*>, forward_iterator<int*> >();
+  test<bidirectional_iterator<int*>, bidirectional_iterator<int*> >();
+  test<bidirectional_iterator<int*>, random_access_iterator<int*> >();
+  test<bidirectional_iterator<int*>, int*>();
+
+  test<random_access_iterator<int*>, forward_iterator<int*> >();
+  test<random_access_iterator<int*>, bidirectional_iterator<int*> >();
+  test<random_access_iterator<int*>, random_access_iterator<int*> >();
+  test<random_access_iterator<int*>, int*>();
+
+  test<int*, forward_iterator<int*> >();
+  test<int*, bidirectional_iterator<int*> >();
+  test<int*, random_access_iterator<int*> >();
+  test<int*, int*>();
 
 #if TEST_STD_VER >= 11
-    test1<forward_iterator<std::unique_ptr<int>*>, forward_iterator<std::unique_ptr<int>*> >();
-    test1<forward_iterator<std::unique_ptr<int>*>, bidirectional_iterator<std::unique_ptr<int>*> >();
-    test1<forward_iterator<std::unique_ptr<int>*>, random_access_iterator<std::unique_ptr<int>*> >();
-    test1<forward_iterator<std::unique_ptr<int>*>, std::unique_ptr<int>*>();
-
-    test1<bidirectional_iterator<std::unique_ptr<int>*>, forward_iterator<std::unique_ptr<int>*> >();
-    test1<bidirectional_iterator<std::unique_ptr<int>*>, bidirectional_iterator<std::unique_ptr<int>*> >();
-    test1<bidirectional_iterator<std::unique_ptr<int>*>, random_access_iterator<std::unique_ptr<int>*> >();
-    test1<bidirectional_iterator<std::unique_ptr<int>*>, std::unique_ptr<int>*>();
-
-    test1<random_access_iterator<std::unique_ptr<int>*>, forward_iterator<std::unique_ptr<int>*> >();
-    test1<random_access_iterator<std::unique_ptr<int>*>, bidirectional_iterator<std::unique_ptr<int>*> >();
-    test1<random_access_iterator<std::unique_ptr<int>*>, random_access_iterator<std::unique_ptr<int>*> >();
-    test1<random_access_iterator<std::unique_ptr<int>*>, std::unique_ptr<int>*>();
-
-    test1<std::unique_ptr<int>*, forward_iterator<std::unique_ptr<int>*> >();
-    test1<std::unique_ptr<int>*, bidirectional_iterator<std::unique_ptr<int>*> >();
-    test1<std::unique_ptr<int>*, random_access_iterator<std::unique_ptr<int>*> >();
-    test1<std::unique_ptr<int>*, std::unique_ptr<int>*>();
+  test1<forward_iterator<std::unique_ptr<int>*>, forward_iterator<std::unique_ptr<int>*> >();
+  test1<forward_iterator<std::unique_ptr<int>*>, bidirectional_iterator<std::unique_ptr<int>*> >();
+  test1<forward_iterator<std::unique_ptr<int>*>, random_access_iterator<std::unique_ptr<int>*> >();
+  test1<forward_iterator<std::unique_ptr<int>*>, std::unique_ptr<int>*>();
+
+  test1<bidirectional_iterator<std::unique_ptr<int>*>, forward_iterator<std::unique_ptr<int>*> >();
+  test1<bidirectional_iterator<std::unique_ptr<int>*>, bidirectional_iterator<std::unique_ptr<int>*> >();
+  test1<bidirectional_iterator<std::unique_ptr<int>*>, random_access_iterator<std::unique_ptr<int>*> >();
+  test1<bidirectional_iterator<std::unique_ptr<int>*>, std::unique_ptr<int>*>();
+
+  test1<random_access_iterator<std::unique_ptr<int>*>, forward_iterator<std::unique_ptr<int>*> >();
+  test1<random_access_iterator<std::unique_ptr<int>*>, bidirectional_iterator<std::unique_ptr<int>*> >();
+  test1<random_access_iterator<std::unique_ptr<int>*>, random_access_iterator<std::unique_ptr<int>*> >();
+  test1<random_access_iterator<std::unique_ptr<int>*>, std::unique_ptr<int>*>();
+
+  test1<std::unique_ptr<int>*, forward_iterator<std::unique_ptr<int>*> >();
+  test1<std::unique_ptr<int>*, bidirectional_iterator<std::unique_ptr<int>*> >();
+  test1<std::unique_ptr<int>*, random_access_iterator<std::unique_ptr<int>*> >();
+  test1<std::unique_ptr<int>*, std::unique_ptr<int>*>();
 #endif // TEST_STD_VER >= 11
 
 #if TEST_STD_VER > 17
-    static_assert(test_swap_constexpr());
+  static_assert(test_swap_constexpr());
 #endif // TEST_STD_VER > 17
 
-    test2();
+  test2();
+
+  { // Test vector<bool>::iterator optimization
+    TestBitIter<8>()();
+    TestBitIter<16>()();
+    TestBitIter<32>()();
+    TestBitIter<64>()();
+    TestBitIter<1024>()();
+  }
 
   return 0;
 }



More information about the libcxx-commits mailing list