[libcxx-commits] [libcxx] [libc++] Optimize ranges::copy{, _n} for vector<bool>::iterator (PR #121013)

via libcxx-commits libcxx-commits at lists.llvm.org
Sat Dec 28 05:34:40 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-libcxx

Author: Peng Liu (winner245)

<details>
<summary>Changes</summary>

This PR optimizes the performance of `std::ranges::copy` and `std::ranges::copy_n` specifically for `vector<bool>::iterator`, addressing a subtask outlined in issue #<!-- -->64038. The optimizations yield performance improvements of up to **2000x** for aligned copies and **60x** for unaligned copies. Additionally, new tests have been added to validate these enhancements.


- Aligned source-destination bits

ranges::copy
```
------------------------------------------------------------------------
Benchmark                              Before        After   Improvement
------------------------------------------------------------------------
bm_ranges_copy_aligned/8              10.8 ns      1.42 ns           8x
bm_ranges_copy_aligned/64             88.5 ns      2.28 ns          39x
bm_ranges_copy_aligned/512             709 ns      1.95 ns         364x
bm_ranges_copy_aligned/4096           5568 ns      5.01 ns        1111x
bm_ranges_copy_aligned/32768         44754 ns      38.7 ns        1156x
bm_ranges_copy_aligned/65536         91092 ns      73.2 ns        1244x
bm_ranges_copy_aligned/102400       139473 ns       127 ns        1098x
bm_ranges_copy_aligned/106496       189004 ns      81.5 ns        2319x
bm_ranges_copy_aligned/110592       153647 ns      71.1 ns        2161x
bm_ranges_copy_aligned/114688       159261 ns      70.2 ns        2269x
bm_ranges_copy_aligned/118784       181910 ns      73.5 ns        2475x
bm_ranges_copy_aligned/122880       174117 ns      76.5 ns        2276x
bm_ranges_copy_aligned/126976       176020 ns      82.0 ns        2147x
bm_ranges_copy_aligned/131072       180757 ns       137 ns        1319x
bm_ranges_copy_aligned/135168       190342 ns       158 ns        1205x
bm_ranges_copy_aligned/139264       192831 ns       103 ns        1872x
bm_ranges_copy_aligned/143360       199627 ns      89.4 ns        2233x
bm_ranges_copy_aligned/147456       203881 ns      88.6 ns        2301x
bm_ranges_copy_aligned/151552       213345 ns      88.4 ns        2413x
bm_ranges_copy_aligned/155648       216892 ns      92.9 ns        2335x
bm_ranges_copy_aligned/159744       222751 ns      96.4 ns        2311x
bm_ranges_copy_aligned/163840       225995 ns       173 ns        1306x
bm_ranges_copy_aligned/167936       235230 ns       202 ns        1165x
bm_ranges_copy_aligned/172032       244093 ns       131 ns        1863x
bm_ranges_copy_aligned/176128       244434 ns       111 ns        2202x
bm_ranges_copy_aligned/180224       249570 ns       108 ns        2311x
bm_ranges_copy_aligned/184320       254538 ns       108 ns        2357x
bm_ranges_copy_aligned/188416       261817 ns       113 ns        2317x
bm_ranges_copy_aligned/192512       269923 ns       125 ns        2159x
bm_ranges_copy_aligned/196608       273494 ns       210 ns        1302x
bm_ranges_copy_aligned/200704       280035 ns       269 ns        1041x
bm_ranges_copy_aligned/204800       293102 ns       231 ns        1269x
```

ranges::copy_n
```
------------------------------------------------------------------------
Benchmark                              Before        After   Improvement
------------------------------------------------------------------------
bm_ranges_copy_n_aligned/8            11.8 ns      0.89 ns          13x
bm_ranges_copy_n_aligned/64           91.6 ns      2.06 ns          44x
bm_ranges_copy_n_aligned/512           718 ns      2.45 ns         293x
bm_ranges_copy_n_aligned/4096         5750 ns      5.02 ns        1145x
bm_ranges_copy_n_aligned/32768       45824 ns      40.9 ns        1120x
bm_ranges_copy_n_aligned/65536       92267 ns      73.8 ns        1250x
bm_ranges_copy_n_aligned/102400     143267 ns       125 ns        1146x
bm_ranges_copy_n_aligned/106496     148625 ns      82.4 ns        1804x
bm_ranges_copy_n_aligned/110592     154817 ns      72.0 ns        2150x
bm_ranges_copy_n_aligned/114688     157953 ns      70.4 ns        2244x
bm_ranges_copy_n_aligned/118784     162374 ns      71.5 ns        2270x
bm_ranges_copy_n_aligned/122880     168638 ns      72.9 ns        2313x
bm_ranges_copy_n_aligned/126976     175596 ns      76.6 ns        2292x
bm_ranges_copy_n_aligned/131072     181164 ns       135 ns        1342x
bm_ranges_copy_n_aligned/135168     184697 ns       157 ns        1176x
bm_ranges_copy_n_aligned/139264     191395 ns       104 ns        1840x
bm_ranges_copy_n_aligned/143360     194954 ns      88.3 ns        2208x
bm_ranges_copy_n_aligned/147456     208917 ns      86.1 ns        2426x
bm_ranges_copy_n_aligned/151552     211101 ns      87.2 ns        2421x
bm_ranges_copy_n_aligned/155648     213175 ns      89.0 ns        2395x
bm_ranges_copy_n_aligned/159744     218988 ns      86.7 ns        2526x
bm_ranges_copy_n_aligned/163840     225263 ns       156 ns        1444x
bm_ranges_copy_n_aligned/167936     230725 ns       184 ns        1254x
bm_ranges_copy_n_aligned/172032     235795 ns       119 ns        1981x
bm_ranges_copy_n_aligned/176128     241145 ns       101 ns        2388x
bm_ranges_copy_n_aligned/180224     250680 ns      99.5 ns        2519x
bm_ranges_copy_n_aligned/184320     262954 ns      99.7 ns        2637x
bm_ranges_copy_n_aligned/188416     258584 ns       103 ns        2510x
bm_ranges_copy_n_aligned/192512     267190 ns       125 ns        2138x
bm_ranges_copy_n_aligned/196608     270821 ns       213 ns        1271x
bm_ranges_copy_n_aligned/200704     279532 ns       262 ns        1067x
bm_ranges_copy_n_aligned/204800     283412 ns       222 ns        1277x
```

- Unaligned source-destination bits
```
-----------------------------------------------------------------------------
Benchmark                                 Before           After  Improvement
-----------------------------------------------------------------------------
bm_ranges_copy_unaligned/8               12.8 ns         8.59 ns         1.5x
bm_ranges_copy_unaligned/64              98.2 ns         8.24 ns          12x
bm_ranges_copy_unaligned/512              755 ns         18.1 ns          42x
bm_ranges_copy_unaligned/4096            6027 ns          102 ns          59x
bm_ranges_copy_unaligned/32768          47663 ns          774 ns          62x
bm_ranges_copy_unaligned/262144        378981 ns         6455 ns          59x
bm_ranges_copy_unaligned/1048576      1520486 ns        25942 ns          59x
bm_ranges_copy_n_unaligned/8             11.3 ns         8.22 ns         1.4x
bm_ranges_copy_n_unaligned/64            97.3 ns         7.89 ns          12x
bm_ranges_copy_n_unaligned/512            747 ns         18.1 ns          41x
bm_ranges_copy_n_unaligned/4096          5932 ns         99.0 ns          60x
bm_ranges_copy_n_unaligned/32768        47776 ns          749 ns          64x
bm_ranges_copy_n_unaligned/262144      378802 ns         6576 ns          58x
bm_ranges_copy_n_unaligned/1048576    1547234 ns        26229 ns          59x
```

---

Patch is 33.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121013.diff


9 Files Affected:

- (modified) libcxx/docs/ReleaseNotes/20.rst (+3) 
- (modified) libcxx/include/__algorithm/copy.h (+133-1) 
- (modified) libcxx/include/__bit_reference (+5-126) 
- (modified) libcxx/include/bitset (+2) 
- (added) libcxx/test/benchmarks/algorithms/copy.bench.cpp (+89) 
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp (+33-1) 
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp (+77-50) 
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy.pass.cpp (+36) 
- (modified) libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp (+46-7) 


``````````diff
diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst
index c8a07fb8b73348..4cb2eff0688176 100644
--- a/libcxx/docs/ReleaseNotes/20.rst
+++ b/libcxx/docs/ReleaseNotes/20.rst
@@ -73,6 +73,9 @@ Improvements and New Features
   optimized, resulting in a performance improvement of up to 2x for trivial element types (e.g., `std::vector<int>`),
   and up to 3.4x for non-trivial element types (e.g., `std::vector<std::vector<int>>`).
 
+- The ``std::ranges::copy`` and ``std::ranges::copy_n`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
+  resulting in a performance improvement of up to 2000x.
+
 Deprecations and Removals
 -------------------------
 
diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4f30b2050abbaf..745056fa9ec058 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -13,8 +13,10 @@
 #include <__algorithm/for_each_segment.h>
 #include <__algorithm/min.h>
 #include <__config>
+#include <__fwd/bit_reference.h>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/segmented_iterator.h>
+#include <__memory/pointer_traits.h>
 #include <__type_traits/common_type.h>
 #include <__type_traits/enable_if.h>
 #include <__utility/move.h>
@@ -29,9 +31,129 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _InputIterator, class _OutputIterator>
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result);
+
 template <class _InIter, class _Sent, class _OutIter>
 inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter);
 
+template <class _Cp, bool _IsConst>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
+    __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
+  using _In             = __bit_iterator<_Cp, _IsConst>;
+  using difference_type = typename _In::difference_type;
+  using __storage_type  = typename _In::__storage_type;
+
+  const int __bits_per_word = _In::__bits_per_word;
+  difference_type __n       = __last - __first;
+  if (__n > 0) {
+    // do first word
+    if (__first.__ctz_ != 0) {
+      unsigned __clz       = __bits_per_word - __first.__ctz_;
+      difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
+      __n -= __dn;
+      __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+      __storage_type __b = *__first.__seg_ & __m;
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b;
+      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+      ++__first.__seg_;
+      // __first.__ctz_ = 0;
+    }
+    // __first.__ctz_ == 0;
+    // do middle words
+    __storage_type __nw = __n / __bits_per_word;
+    std::copy(std::__to_address(__first.__seg_),
+              std::__to_address(__first.__seg_ + __nw),
+              std::__to_address(__result.__seg_));
+    __n -= __nw * __bits_per_word;
+    __result.__seg_ += __nw;
+    // do last word
+    if (__n > 0) {
+      __first.__seg_ += __nw;
+      __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+      __storage_type __b = *__first.__seg_ & __m;
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b;
+      __result.__ctz_ = static_cast<unsigned>(__n);
+    }
+  }
+  return __result;
+}
+
+template <class _Cp, bool _IsConst>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
+    __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
+  using _In             = __bit_iterator<_Cp, _IsConst>;
+  using difference_type = typename _In::difference_type;
+  using __storage_type  = typename _In::__storage_type;
+
+  const int __bits_per_word = _In::__bits_per_word;
+  difference_type __n       = __last - __first;
+  if (__n > 0) {
+    // do first word
+    if (__first.__ctz_ != 0) {
+      unsigned __clz_f     = __bits_per_word - __first.__ctz_;
+      difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
+      __n -= __dn;
+      __storage_type __m   = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+      __storage_type __b   = *__first.__seg_ & __m;
+      unsigned __clz_r     = __bits_per_word - __result.__ctz_;
+      __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
+      __m                  = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
+      *__result.__seg_ &= ~__m;
+      if (__result.__ctz_ > __first.__ctz_)
+        *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
+      else
+        *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
+      __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
+      __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
+      __dn -= __ddn;
+      if (__dn > 0) {
+        __m = ~__storage_type(0) >> (__bits_per_word - __dn);
+        *__result.__seg_ &= ~__m;
+        *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
+        __result.__ctz_ = static_cast<unsigned>(__dn);
+      }
+      ++__first.__seg_;
+      // __first.__ctz_ = 0;
+    }
+    // __first.__ctz_ == 0;
+    // do middle words
+    unsigned __clz_r   = __bits_per_word - __result.__ctz_;
+    __storage_type __m = ~__storage_type(0) << __result.__ctz_;
+    for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
+      __storage_type __b = *__first.__seg_;
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b << __result.__ctz_;
+      ++__result.__seg_;
+      *__result.__seg_ &= __m;
+      *__result.__seg_ |= __b >> __clz_r;
+    }
+    // do last word
+    if (__n > 0) {
+      __m                 = ~__storage_type(0) >> (__bits_per_word - __n);
+      __storage_type __b  = *__first.__seg_ & __m;
+      __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
+      __m                 = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
+      *__result.__seg_ &= ~__m;
+      *__result.__seg_ |= __b << __result.__ctz_;
+      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
+      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
+      __n -= __dn;
+      if (__n > 0) {
+        __m = ~__storage_type(0) >> (__bits_per_word - __n);
+        *__result.__seg_ &= ~__m;
+        *__result.__seg_ |= __b >> __dn;
+        __result.__ctz_ = static_cast<unsigned>(__n);
+      }
+    }
+  }
+  return __result;
+}
+
 struct __copy_impl {
   template <class _InIter, class _Sent, class _OutIter>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
@@ -95,6 +217,16 @@ struct __copy_impl {
     }
   }
 
+  template <class _Cp, bool _IsConst>
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
+  operator()(__bit_iterator<_Cp, _IsConst> __first,
+             __bit_iterator<_Cp, _IsConst> __last,
+             __bit_iterator<_Cp, false> __result) {
+    if (__first.__ctz_ == __result.__ctz_)
+      return std::make_pair(__last, std::__copy_aligned(__first, __last, __result));
+    return std::make_pair(__last, std::__copy_unaligned(__first, __last, __result));
+  }
+
   // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
   template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
@@ -110,7 +242,7 @@ __copy(_InIter __first, _Sent __last, _OutIter __result) {
 }
 
 template <class _InputIterator, class _OutputIterator>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
 copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) {
   return std::__copy(__first, __last, __result).second;
 }
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 9fa24c98d493fd..b51ee1c58dc009 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -10,6 +10,7 @@
 #ifndef _LIBCPP___BIT_REFERENCE
 #define _LIBCPP___BIT_REFERENCE
 
+#include <__algorithm/copy.h>
 #include <__algorithm/copy_n.h>
 #include <__algorithm/min.h>
 #include <__bit/countr.h>
@@ -22,6 +23,7 @@
 #include <__memory/pointer_traits.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/is_constant_evaluated.h>
+#include <__utility/pair.h>
 #include <__utility/swap.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -169,130 +171,6 @@ private:
         __mask_(__m) {}
 };
 
-// copy
-
-template <class _Cp, bool _IsConst>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
-    __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
-  using _In             = __bit_iterator<_Cp, _IsConst>;
-  using difference_type = typename _In::difference_type;
-  using __storage_type  = typename _In::__storage_type;
-
-  const int __bits_per_word = _In::__bits_per_word;
-  difference_type __n       = __last - __first;
-  if (__n > 0) {
-    // do first word
-    if (__first.__ctz_ != 0) {
-      unsigned __clz       = __bits_per_word - __first.__ctz_;
-      difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
-      __n -= __dn;
-      __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
-      __storage_type __b = *__first.__seg_ & __m;
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b;
-      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
-      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
-      ++__first.__seg_;
-      // __first.__ctz_ = 0;
-    }
-    // __first.__ctz_ == 0;
-    // do middle words
-    __storage_type __nw = __n / __bits_per_word;
-    std::copy_n(std::__to_address(__first.__seg_), __nw, std::__to_address(__result.__seg_));
-    __n -= __nw * __bits_per_word;
-    __result.__seg_ += __nw;
-    // do last word
-    if (__n > 0) {
-      __first.__seg_ += __nw;
-      __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
-      __storage_type __b = *__first.__seg_ & __m;
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b;
-      __result.__ctz_ = static_cast<unsigned>(__n);
-    }
-  }
-  return __result;
-}
-
-template <class _Cp, bool _IsConst>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
-    __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
-  using _In             = __bit_iterator<_Cp, _IsConst>;
-  using difference_type = typename _In::difference_type;
-  using __storage_type  = typename _In::__storage_type;
-
-  const int __bits_per_word = _In::__bits_per_word;
-  difference_type __n       = __last - __first;
-  if (__n > 0) {
-    // do first word
-    if (__first.__ctz_ != 0) {
-      unsigned __clz_f     = __bits_per_word - __first.__ctz_;
-      difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
-      __n -= __dn;
-      __storage_type __m   = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
-      __storage_type __b   = *__first.__seg_ & __m;
-      unsigned __clz_r     = __bits_per_word - __result.__ctz_;
-      __storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
-      __m                  = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
-      *__result.__seg_ &= ~__m;
-      if (__result.__ctz_ > __first.__ctz_)
-        *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
-      else
-        *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
-      __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
-      __result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
-      __dn -= __ddn;
-      if (__dn > 0) {
-        __m = ~__storage_type(0) >> (__bits_per_word - __dn);
-        *__result.__seg_ &= ~__m;
-        *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
-        __result.__ctz_ = static_cast<unsigned>(__dn);
-      }
-      ++__first.__seg_;
-      // __first.__ctz_ = 0;
-    }
-    // __first.__ctz_ == 0;
-    // do middle words
-    unsigned __clz_r   = __bits_per_word - __result.__ctz_;
-    __storage_type __m = ~__storage_type(0) << __result.__ctz_;
-    for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
-      __storage_type __b = *__first.__seg_;
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b << __result.__ctz_;
-      ++__result.__seg_;
-      *__result.__seg_ &= __m;
-      *__result.__seg_ |= __b >> __clz_r;
-    }
-    // do last word
-    if (__n > 0) {
-      __m                 = ~__storage_type(0) >> (__bits_per_word - __n);
-      __storage_type __b  = *__first.__seg_ & __m;
-      __storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
-      __m                 = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
-      *__result.__seg_ &= ~__m;
-      *__result.__seg_ |= __b << __result.__ctz_;
-      __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
-      __result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
-      __n -= __dn;
-      if (__n > 0) {
-        __m = ~__storage_type(0) >> (__bits_per_word - __n);
-        *__result.__seg_ &= ~__m;
-        *__result.__seg_ |= __b >> __dn;
-        __result.__ctz_ = static_cast<unsigned>(__n);
-      }
-    }
-  }
-  return __result;
-}
-
-template <class _Cp, bool _IsConst>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
-copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
-  if (__first.__ctz_ == __result.__ctz_)
-    return std::__copy_aligned(__first, __last, __result);
-  return std::__copy_unaligned(__first, __last, __result);
-}
-
 // copy_backward
 
 template <class _Cp, bool _IsConst>
@@ -975,8 +853,9 @@ private:
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_unaligned(
       __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
   template <class _Dp, bool _IC>
-  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
-  copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
+  _LIBCPP_CONSTEXPR_SINCE_CXX20 friend pair<__bit_iterator<_Dp, _IC>, __bit_iterator<_Dp, false> >
+  __copy_impl::operator()(
+      __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
   template <class _Dp, bool _IC>
   _LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_aligned(
       __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 8b361824805571..bb09bc5415fb03 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -129,6 +129,8 @@ template <size_t N> struct hash<std::bitset<N>>;
 #if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 #  include <__cxx03/bitset>
 #else
+#  include <__algorithm/copy.h>
+#  include <__algorithm/copy_backward.h>
 #  include <__algorithm/count.h>
 #  include <__algorithm/fill.h>
 #  include <__algorithm/fill_n.h>
diff --git a/libcxx/test/benchmarks/algorithms/copy.bench.cpp b/libcxx/test/benchmarks/algorithms/copy.bench.cpp
new file mode 100644
index 00000000000000..54006d5a72edea
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/copy.bench.cpp
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <vector>
+
+static void bm_ranges_copy(benchmark::State& state, bool aligned) {
+  auto n = state.range();
+  std::vector<bool> in(n, true);
+  std::vector<bool> out(aligned ? n : n + 8);
+  benchmark::DoNotOptimize(&in);
+  auto dst = aligned ? out.begin() : out.begin() + 4;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::ranges::copy(in, dst));
+    benchmark::DoNotOptimize(&out);
+  }
+}
+
+static void bm_ranges_copy_n(benchmark::State& state, bool aligned) {
+  auto n = state.range();
+  std::vector<bool> in(n, true);
+  std::vector<bool> out(aligned ? n : n + 8);
+  benchmark::DoNotOptimize(&in);
+  auto src = in.begin();
+  auto dst = aligned ? out.begin() : out.begin() + 4;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::ranges::copy_n(src, n, dst));
+    benchmark::DoNotOptimize(&out);
+  }
+}
+
+static void bm_copy(benchmark::State& state, bool aligned) {
+  auto n = state.range();
+  std::vector<bool> in(n, true);
+  std::vector<bool> out(aligned ? n : n + 8);
+  benchmark::DoNotOptimize(&in);
+  auto beg = in.begin();
+  auto end = in.end();
+  auto dst = aligned ? out.begin() : out.begin() + 4;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::copy(beg, end, dst));
+    benchmark::DoNotOptimize(&out);
+  }
+}
+
+static void bm_copy_n(benchmark::State& state, bool aligned) {
+  auto n = state.range();
+  std::vector<bool> in(n, true);
+  std::vector<bool> out(aligned ? n : n + 8);
+  benchmark::DoNotOptimize(&in);
+  auto src = in.begin();
+  auto dst = aligned ? out.begin() : out.begin() + 4;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::copy_n(src, n, dst));
+    benchmark::DoNotOptimize(&out);
+  }
+}
+
+static void bm_ranges_copy_aligned(benchmark::State& state) { bm_ranges_copy(state, true); }
+static void bm_ranges_copy_unaligned(benchmark::State& state) { bm_ranges_copy(state, false); }
+static void bm_ranges_copy_n_aligned(benchmark::State& state) { bm_ranges_copy_n(state, true); }
+static void bm_ranges_copy_n_unaligned(benchmark::State& state) { bm_ranges_copy_n(state, false); }
+
+static void bm_copy_aligned(benchmark::State& state) { bm_copy(state, true); }
+static void bm_copy_unaligned(benchmark::State& state) { bm_copy(state, false); }
+static void bm_copy_n_aligned(benchmark::State& state) { bm_copy_n(state, true); }
+static void bm_copy_n_unaligned(benchmark::State& state) { bm_copy_n(state, false); }
+
+// Test the range version of std::copy for vector<bool>::iterator
+BENCHMARK(bm_ranges_copy_aligned)->Range(8, 1 << 16)->DenseRange(102400, 204800, 4096);
+BENCHMARK(bm_ranges_copy_n_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_ranges_copy_unaligned)->Range(8, 1 << 20);
+BENCHMARK(bm_ranges_copy_n_unaligned)->Range(8, 1 << 20);
+
+// Test the iterator-pair version of std::copy for vector<bool>::iterator
+BENCHMARK(bm_copy_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_copy_n_aligned)->Range(8, 1 << 20);
+BENCHMARK(bm_copy_unaligned)->Range(8, 1 << 20);
+BENCHMARK(bm_copy_n_unaligned)->Range(8, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
index b5f0a32b986a03..553e4c206ac08a 100644
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <vector>
 
 #...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/121013


More information about the libcxx-commits mailing list