[libcxx-commits] [libcxx] 6fe4e03 - [libc++] Optimize vector push_back to avoid continuous load and store of end pointer

Louis Dionne via libcxx-commits libcxx-commits at lists.llvm.org
Mon Oct 2 06:12:59 PDT 2023


Author: Martijn Vels
Date: 2023-10-02T09:12:37-04:00
New Revision: 6fe4e033f07d332980e1997c19fe705cff9d07a4

URL: https://github.com/llvm/llvm-project/commit/6fe4e033f07d332980e1997c19fe705cff9d07a4
DIFF: https://github.com/llvm/llvm-project/commit/6fe4e033f07d332980e1997c19fe705cff9d07a4.diff

LOG: [libc++] Optimize vector push_back to avoid continuous load and store of end pointer

Credits: this change is based on analysis and a proof of concept by
gerbens at google.com.

Before, the compiler loses track of end as 'this' and other references
possibly escape beyond the compiler's scope. This can be see in the
generated assembly:

     16.28 │200c80:   mov     %r15d,(%rax)
     60.87 │200c83:   add     $0x4,%rax
           │200c87:   mov     %rax,-0x38(%rbp)
      0.03 │200c8b: → jmpq    200d4e
      ...
      ...
      1.69 │200d4e:   cmp     %r15d,%r12d
           │200d51: → je      200c40
     16.34 │200d57:   inc     %r15d
      0.05 │200d5a:   mov     -0x38(%rbp),%rax
      3.27 │200d5e:   mov     -0x30(%rbp),%r13
      1.47 │200d62:   cmp     %r13,%rax
           │200d65: → jne     200c80

We fix this by always explicitly storing the loaded local and pointer
back at the end of push back. This generates some slight source 'noise',
but creates nice and compact fast path code, i.e.:

     32.64 │200760:   mov    %r14d,(%r12)
      9.97 │200764:   add    $0x4,%r12
      6.97 │200768:   mov    %r12,-0x38(%rbp)
     32.17 │20076c:   add    $0x1,%r14d
      2.36 │200770:   cmp    %r14d,%ebx
           │200773: → je     200730
      8.98 │200775:   mov    -0x30(%rbp),%r13
      6.75 │200779:   cmp    %r13,%r12
           │20077c: → jne    200760

Now there is a single store for the push_back value (as before), and a
single store for the end without a reload (dependency).

For fully local vectors, (i.e., not referenced elsewhere), the capacity
load and store inside the loop could also be removed, but this requires
more substantial refactoring inside vector.

Differential Revision: https://reviews.llvm.org/D80588

Added: 
    

Modified: 
    libcxx/benchmarks/ContainerBenchmarks.h
    libcxx/benchmarks/vector_operations.bench.cpp
    libcxx/include/vector

Removed: 
    


################################################################################
diff  --git a/libcxx/benchmarks/ContainerBenchmarks.h b/libcxx/benchmarks/ContainerBenchmarks.h
index 071e46c2a1c6546..9a9abfd3b0d0f6a 100644
--- a/libcxx/benchmarks/ContainerBenchmarks.h
+++ b/libcxx/benchmarks/ContainerBenchmarks.h
@@ -79,6 +79,19 @@ void BM_ConstructFromRange(benchmark::State& st, Container, GenInputs gen) {
   }
 }
 
+template <class Container>
+void BM_Pushback(benchmark::State& state, Container c) {
+  int count = state.range(0);
+  c.reserve(count);
+  while (state.KeepRunningBatch(count)) {
+    c.clear();
+    for (int i = 0; i != count; ++i) {
+      c.push_back(i);
+    }
+    benchmark::DoNotOptimize(c.data());
+  }
+}
+
 template <class Container, class GenInputs>
 void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) {
   auto in        = gen(st.range(0));

diff  --git a/libcxx/benchmarks/vector_operations.bench.cpp b/libcxx/benchmarks/vector_operations.bench.cpp
index be0bee6b645612a..38b14c56756fba0 100644
--- a/libcxx/benchmarks/vector_operations.bench.cpp
+++ b/libcxx/benchmarks/vector_operations.bench.cpp
@@ -39,4 +39,6 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_size_t, std::vector<size_t>{}, g
 BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector<std::string>{}, getRandomStringInputs)
     ->Arg(TestNumInputs);
 
+BENCHMARK_CAPTURE(BM_Pushback, vector_int, std::vector<int>{})->Arg(TestNumInputs);
+
 BENCHMARK_MAIN();

diff  --git a/libcxx/include/vector b/libcxx/include/vector
index 6d26ce169b600a1..4ec6b602371eaee 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -833,11 +833,11 @@ private:
 
     template <class _Up>
     _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
-    inline void __push_back_slow_path(_Up&& __x);
+    inline pointer __push_back_slow_path(_Up&& __x);
 
     template <class... _Args>
     _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
-    inline void __emplace_back_slow_path(_Args&&... __args);
+    inline pointer __emplace_back_slow_path(_Args&&... __args);
 
     // The following functions are no-ops outside of AddressSanitizer mode.
     // We call annotations for every allocator, unless explicitly disabled.
@@ -1609,7 +1609,7 @@ vector<_Tp, _Allocator>::shrink_to_fit() _NOEXCEPT
 template <class _Tp, class _Allocator>
 template <class _Up>
 _LIBCPP_CONSTEXPR_SINCE_CXX20
-void
+typename vector<_Tp, _Allocator>::pointer
 vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x)
 {
     allocator_type& __a = this->__alloc();
@@ -1618,6 +1618,7 @@ vector<_Tp, _Allocator>::__push_back_slow_path(_Up&& __x)
     __alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Up>(__x));
     __v.__end_++;
     __swap_out_circular_buffer(__v);
+    return this->__end_;
 }
 
 template <class _Tp, class _Allocator>
@@ -1626,12 +1627,14 @@ inline _LIBCPP_HIDE_FROM_ABI
 void
 vector<_Tp, _Allocator>::push_back(const_reference __x)
 {
-    if (this->__end_ != this->__end_cap())
-    {
+    pointer __end = this->__end_;
+    if (__end < this->__end_cap()) {
         __construct_one_at_end(__x);
+        ++__end;
+    } else {
+        __end = __push_back_slow_path(__x);
     }
-    else
-        __push_back_slow_path(__x);
+    this->__end_ = __end;
 }
 
 template <class _Tp, class _Allocator>
@@ -1640,18 +1643,20 @@ inline _LIBCPP_HIDE_FROM_ABI
 void
 vector<_Tp, _Allocator>::push_back(value_type&& __x)
 {
-    if (this->__end_ < this->__end_cap())
-    {
+    pointer __end = this->__end_;
+    if (__end < this->__end_cap()) {
         __construct_one_at_end(std::move(__x));
+        ++__end;
+    } else {
+        __end = __push_back_slow_path(std::move(__x));
     }
-    else
-        __push_back_slow_path(std::move(__x));
+    this->__end_ = __end;
 }
 
 template <class _Tp, class _Allocator>
 template <class... _Args>
 _LIBCPP_CONSTEXPR_SINCE_CXX20
-void
+typename vector<_Tp, _Allocator>::pointer
 vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args)
 {
     allocator_type& __a = this->__alloc();
@@ -1660,6 +1665,7 @@ vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args)
     __alloc_traits::construct(__a, std::__to_address(__v.__end_), std::forward<_Args>(__args)...);
     __v.__end_++;
     __swap_out_circular_buffer(__v);
+    return this->__end_;
 }
 
 template <class _Tp, class _Allocator>
@@ -1673,14 +1679,16 @@ void
 #endif
 vector<_Tp, _Allocator>::emplace_back(_Args&&... __args)
 {
-    if (this->__end_ < this->__end_cap())
-    {
+    pointer __end = this->__end_;
+    if (__end < this->__end_cap()) {
         __construct_one_at_end(std::forward<_Args>(__args)...);
+        ++__end;
+    } else {
+        __end = __emplace_back_slow_path(std::forward<_Args>(__args)...);
     }
-    else
-        __emplace_back_slow_path(std::forward<_Args>(__args)...);
+    this->__end_ = __end;
 #if _LIBCPP_STD_VER >= 17
-    return this->back();
+    return *(__end - 1);
 #endif
 }
 


        


More information about the libcxx-commits mailing list