[libcxx-commits] [libcxx] 63a2b20 - [libc++, std::vector] call the optimized version of __uninitialized_allocator_copy for trivial types

via libcxx-commits libcxx-commits at lists.llvm.org
Wed May 24 13:41:24 PDT 2023


Author: AdityaK
Date: 2023-05-24T13:40:53-07:00
New Revision: 63a2b206fa4e19ffcadaaeb3c60b9ea4205079a1

URL: https://github.com/llvm/llvm-project/commit/63a2b206fa4e19ffcadaaeb3c60b9ea4205079a1
DIFF: https://github.com/llvm/llvm-project/commit/63a2b206fa4e19ffcadaaeb3c60b9ea4205079a1.diff

LOG: [libc++, std::vector] call the optimized version of __uninitialized_allocator_copy for trivial types

See: https://github.com/llvm/llvm-project/issues/61987

Fix suggested by: @philnik and @var-const

Reviewers: philnik, ldionne, EricWF, var-const

Differential Revision: https://reviews.llvm.org/D147741

Testing:
ninja check-cxx check-clang check-llvm

Benchmark Testcases (BM_CopyConstruct, and BM_Assignment) added.

performance improvement:

Run on (8 X 4800 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x4)
  L1 Instruction 32 KiB (x4)
  L2 Unified 1280 KiB (x4)
  L3 Unified 12288 KiB (x1)
Load Average: 1.66, 3.02, 2.43

Comparing build-runtimes-base/libcxx/benchmarks/vector_operations.libcxx.out to build-runtimes/libcxx/benchmarks/vector_operations.libcxx.out
Benchmark                                                   Time             CPU      Time Old      Time New       CPU Old       CPU New
----------------------------------------------------------------------------------------------------------------------------------------
BM_ConstructSize/vector_byte/5140480                     +0.0362         +0.0362        116906        121132        116902        121131
BM_CopyConstruct/vector_int/5140480                      -0.4563         -0.4577       1755224        954241       1755330        951987
BM_Assignment/vector_int/5140480                         -0.0222         -0.0220        990045        968095        989917        968125
BM_ConstructSizeValue/vector_byte/5140480                +0.0308         +0.0307        116970        120567        116977        120573
BM_ConstructIterIter/vector_char/1024                    -0.0831         -0.0831            19            17            19            17
BM_ConstructIterIter/vector_size_t/1024                  +0.0129         +0.0131            88            89            88            89
BM_ConstructIterIter/vector_string/1024                  -0.0064         -0.0018         54455         54109         54208         54112
OVERALL_GEOMEAN                                          -0.0845         -0.0842             0             0             0             0

FYI, the perf improvements for BM_CopyConstruct due to this patch is mostly subsumed by the https://reviews.llvm.org/D149826. However this patch still adds value by converting copy to memmove (the second testcase).

Before the patch:

```
define linkonce_odr dso_local void @_ZNSt3__16vectorIiNS_9allocatorIiEEE18__construct_at_endIPiS5_EEvT_T0_m(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, i64 noundef %3) local_unnamed_addr #4 comdat align 2 {
  %5 = getelementptr inbounds %"class.std::__1::vector", ptr %0, i64 0, i32 1
  %6 = load ptr, ptr %5, align 8, !tbaa !12
  %7 = icmp eq ptr %1, %2
  br i1 %7, label %16, label %8

8:                                                ; preds = %4, %8
  %9 = phi ptr [ %13, %8 ], [ %1, %4 ]
  %10 = phi ptr [ %14, %8 ], [ %6, %4 ]
  %11 = icmp ne ptr %10, null
  tail call void @llvm.assume(i1 %11)
  %12 = load i32, ptr %9, align 4, !tbaa !14
  store i32 %12, ptr %10, align 4, !tbaa !14
  %13 = getelementptr inbounds i32, ptr %9, i64 1
  %14 = getelementptr inbounds i32, ptr %10, i64 1
  %15 = icmp eq ptr %13, %2
  br i1 %15, label %16, label %8, !llvm.loop !16

16:                                               ; preds = %8, %4
  %17 = phi ptr [ %6, %4 ], [ %14, %8 ]
  store ptr %17, ptr %5, align 8, !tbaa !12
  ret void
}
```

After the patch:
```
define linkonce_odr dso_local void @_ZNSt3__16vectorIiNS_9allocatorIiEEE18__construct_at_endIPiS5_EEvT_T0_m(ptr noundef nonnull align 8 dereferenceable(24) %0, ptr noundef %1, ptr noundef %2, i64 noundef %3) local_unnamed_addr #4 comdat align 2 {
  %5 = getelementptr inbounds %"class.std::__1::vector", ptr %0, i64 0, i32 1
  %6 = load ptr, ptr %5, align 8, !tbaa !12
  %7 = ptrtoint ptr %2 to i64
  %8 = ptrtoint ptr %1 to i64
  %9 = sub i64 %7, %8
  %10 = ashr exact i64 %9, 2
  tail call void @llvm.memmove.p0.p0.i64(ptr align 4 %6, ptr align 4 %1, i64 %9, i1 false)
  %11 = getelementptr inbounds i32, ptr %6, i64 %10
  store ptr %11, ptr %5, align 8, !tbaa !12
  ret void
}
```

This is due to the optimized version of uninitialized_allocator_copy function.

Added: 
    

Modified: 
    libcxx/benchmarks/ContainerBenchmarks.h
    libcxx/benchmarks/vector_operations.bench.cpp
    libcxx/include/__memory/uninitialized_algorithms.h

Removed: 
    


################################################################################
diff  --git a/libcxx/benchmarks/ContainerBenchmarks.h b/libcxx/benchmarks/ContainerBenchmarks.h
index f64869adc99df..4dae95748077f 100644
--- a/libcxx/benchmarks/ContainerBenchmarks.h
+++ b/libcxx/benchmarks/ContainerBenchmarks.h
@@ -26,6 +26,28 @@ void BM_ConstructSize(benchmark::State& st, Container) {
   }
 }
 
+template <class Container>
+void BM_CopyConstruct(benchmark::State& st, Container) {
+  auto size = st.range(0);
+  Container c(size);
+  for (auto _ : st) {
+    auto v = c;
+    DoNotOptimizeData(v);
+  }
+}
+
+template <class Container>
+void BM_Assignment(benchmark::State& st, Container) {
+  auto size = st.range(0);
+  Container c1;
+  Container c2(size);
+  for (auto _ : st) {
+    c1 = c2;
+    DoNotOptimizeData(c1);
+    DoNotOptimizeData(c2);
+  }
+}
+
 template <class Container>
 void BM_ConstructSizeValue(benchmark::State& st, Container, typename Container::value_type const& val) {
   const auto size = st.range(0);

diff  --git a/libcxx/benchmarks/vector_operations.bench.cpp b/libcxx/benchmarks/vector_operations.bench.cpp
index 70a317df9fb83..2e707c0a1b3c5 100644
--- a/libcxx/benchmarks/vector_operations.bench.cpp
+++ b/libcxx/benchmarks/vector_operations.bench.cpp
@@ -17,6 +17,14 @@ BENCHMARK_CAPTURE(BM_ConstructSize,
     vector_byte,
     std::vector<unsigned char>{})->Arg(5140480);
 
+BENCHMARK_CAPTURE(BM_CopyConstruct,
+    vector_int,
+    std::vector<int>{})->Arg(5140480);
+
+BENCHMARK_CAPTURE(BM_Assignment,
+    vector_int,
+    std::vector<int>{})->Arg(5140480);
+
 BENCHMARK_CAPTURE(BM_ConstructSizeValue,
     vector_byte,
     std::vector<unsigned char>{}, 0)->Arg(5140480);

diff  --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h
index 62a5dc9916dcb..c02803330bcfb 100644
--- a/libcxx/include/__memory/uninitialized_algorithms.h
+++ b/libcxx/include/__memory/uninitialized_algorithms.h
@@ -12,6 +12,8 @@
 
 #include <__algorithm/copy.h>
 #include <__algorithm/move.h>
+#include <__algorithm/unwrap_iter.h>
+#include <__algorithm/unwrap_range.h>
 #include <__config>
 #include <__iterator/iterator_traits.h>
 #include <__iterator/reverse_iterator.h>
@@ -545,7 +547,7 @@ class _AllocatorDestroyRangeReverse {
 // already copied elements are destroyed in reverse order of their construction.
 template <class _Alloc, class _Iter1, class _Sent1, class _Iter2>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter2
-__uninitialized_allocator_copy(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) {
+__uninitialized_allocator_copy_impl(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) {
   auto __destruct_first = __first2;
   auto __guard =
       std::__make_exception_guard(_AllocatorDestroyRangeReverse<_Alloc, _Iter2>(__alloc, __destruct_first, __first2));
@@ -565,14 +567,16 @@ template <class _Type>
 struct __allocator_has_trivial_copy_construct<allocator<_Type>, _Type> : true_type {};
 
 template <class _Alloc,
-          class _Type,
-          class _RawType = __remove_const_t<_Type>,
+          class _In,
+          class _RawTypeIn = __remove_const_t<_In>,
+          class _Out,
           __enable_if_t<
-              // using _RawType because of the allocator<T const> extension
-              is_trivially_copy_constructible<_RawType>::value && is_trivially_copy_assignable<_RawType>::value &&
-              __allocator_has_trivial_copy_construct<_Alloc, _RawType>::value>* = nullptr>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Type*
-__uninitialized_allocator_copy(_Alloc&, const _Type* __first1, const _Type* __last1, _Type* __first2) {
+              // using _RawTypeIn because of the allocator<T const> extension
+              is_trivially_copy_constructible<_RawTypeIn>::value && is_trivially_copy_assignable<_RawTypeIn>::value &&
+              is_same<__remove_cv_t<_In>, __remove_cv_t<_Out> >::value &&
+              __allocator_has_trivial_copy_construct<_Alloc, _RawTypeIn>::value>* = nullptr>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Out*
+__uninitialized_allocator_copy_impl(_Alloc&, _In* __first1, _In* __last1, _Out* __first2) {
   // TODO: Remove the const_cast once we drop support for std::allocator<T const>
   if (__libcpp_is_constant_evaluated()) {
     while (__first1 != __last1) {
@@ -582,10 +586,17 @@ __uninitialized_allocator_copy(_Alloc&, const _Type* __first1, const _Type* __la
     }
     return __first2;
   } else {
-    return std::copy(__first1, __last1, const_cast<_RawType*>(__first2));
+    return std::copy(__first1, __last1, const_cast<_RawTypeIn*>(__first2));
   }
 }
 
+template <class _Alloc, class _Iter1, class _Sent1, class _Iter2>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter2 __uninitialized_allocator_copy(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1, _Iter2 __first2) {
+    auto __unwrapped_range = std::__unwrap_range(__first1, __last1);
+    auto __result = std::__uninitialized_allocator_copy_impl(__alloc, __unwrapped_range.first, __unwrapped_range.second, std::__unwrap_iter(__first2));
+    return std::__rewrap_iter(__first2, __result);
+}
+
 // Move-construct the elements [__first1, __last1) into [__first2, __first2 + N)
 // if the move constructor is noexcept, where N is distance(__first1, __last1).
 //


        


More information about the libcxx-commits mailing list