[llvm-branch-commits] [libcxx] [libc++][format][3/3] Improves formatting performance. (PR #108990)

Mark de Wever via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Sep 25 08:12:09 PDT 2024


https://github.com/mordante updated https://github.com/llvm/llvm-project/pull/108990

>From d435a3118ffe1cca91eff7eeea19f4bc243384c4 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq at xs4all.nl>
Date: Sat, 30 Mar 2024 17:35:56 +0100
Subject: [PATCH] [libc++][format][3/3] Improves formatting performance.

This changes the __output_buffer to a new structure. This improves the
performace of std::format, std::format_to, std::format_to_n, and
std::foramtted size.
---
 libcxx/include/__format/buffer.h              | 616 ++++++++++--------
 libcxx/include/__format/format_functions.h    |  29 +-
 .../test/libcxx/transitive_includes/cxx03.csv |  18 -
 .../test/libcxx/transitive_includes/cxx11.csv |  18 -
 .../test/libcxx/transitive_includes/cxx14.csv |  18 -
 .../test/libcxx/transitive_includes/cxx17.csv |   8 -
 .../format/format.functions/format_tests.h    |   2 +-
 7 files changed, 352 insertions(+), 357 deletions(-)

diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h
index 8598f0a1c03957..a0caaccb47c2d2 100644
--- a/libcxx/include/__format/buffer.h
+++ b/libcxx/include/__format/buffer.h
@@ -14,6 +14,7 @@
 #include <__algorithm/fill_n.h>
 #include <__algorithm/max.h>
 #include <__algorithm/min.h>
+#include <__algorithm/ranges_copy.h>
 #include <__algorithm/ranges_copy_n.h>
 #include <__algorithm/transform.h>
 #include <__algorithm/unwrap_iter.h>
@@ -29,6 +30,7 @@
 #include <__iterator/wrap_iter.h>
 #include <__memory/addressof.h>
 #include <__memory/allocate_at_least.h>
+#include <__memory/allocator.h>
 #include <__memory/allocator_traits.h>
 #include <__memory/construct_at.h>
 #include <__memory/ranges_construct_at.h>
@@ -38,6 +40,7 @@
 #include <__utility/exception_guard.h>
 #include <__utility/move.h>
 #include <cstddef>
+#include <stdexcept>
 #include <string_view>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -53,24 +56,150 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace __format {
 
+// A helper to limit the total size of code units written.
+class _LIBCPP_HIDE_FROM_ABI __max_output_size {
+public:
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __max_output_size(size_t __max_size) : __max_size_{__max_size} {}
+
+  // This function adjusts the size of a (bulk) write operations. It ensures the
+  // number of code units written by a __output_buffer never exceeds
+  // __max_size_ code units.
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __write_request(size_t __code_units) {
+    size_t __result =
+        __code_units_written_ < __max_size_ ? std::min(__code_units, __max_size_ - __code_units_written_) : 0;
+    __code_units_written_ += __code_units;
+    return __result;
+  }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __code_units_written() const noexcept { return __code_units_written_; }
+
+private:
+  size_t __max_size_;
+  // The code units that would have been written if there was no limit.
+  // format_to_n returns this value.
+  size_t __code_units_written_{0};
+};
+
 /// A "buffer" that handles writing to the proper iterator.
 ///
 /// This helper is used together with the @ref back_insert_iterator to offer
 /// type-erasure for the formatting functions. This reduces the number to
 /// template instantiations.
+///
+/// The design is the following:
+/// - There is an external object that connects the buffer to the output.
+/// - This buffer object:
+///   - inherits publicly from this class.
+///   - has a static or dynamic buffer.
+///   - has a static member function to make space in its buffer write
+///     operations. This can be done by increasing the size of the internal
+///     buffer or by writing the contents of the buffer to the output iterator.
+///
+///     This member function is a constructor argument, so its name is not
+///     fixed. The code uses the name __prepare_write.
+/// - The number of output code units can be limited by a __max_output_size
+///   object. This is used in format_to_n This object:
+///   - Contains the maximum number of code units to be written.
+///   - Contains the number of code units that are requested to be written.
+///     This number is returned to the user of format_to_n.
+///   - The write functions call the object's __request_write member function.
+///     This function:
+///     - Updates the number of code units that are requested to be written.
+///     - Returns the number of code units that can be written without
+///       exceeding the maximum number of code units to be written.
+///
+/// Documentation for the buffer usage members:
+/// - __ptr_
+///   The start of the buffer.
+/// - __capacity_
+///   The number of code units that can be written. This means
+///   [__ptr_, __ptr_ + __capacity_) is a valid range to write to.
+/// - __size_
+///   The number of code units written in the buffer. The next code unit will
+///   be written at __ptr_ + __size_. This __size_ may NOT contain the total
+///   number of code units written by the __output_buffer. Whether or not it
+///   does depends on the sub-class used. Typically the total number of code
+///   units written is not interesting. It is interesting for format_to_n which
+///   has its own way to track this number.
+///
+/// Documentation for the buffer modifying buffer operations:
+/// The subclasses have a function with the following signature:
+///
+///   static void __prepare_write(
+///     __output_buffer<_CharT>& __buffer, size_t __code_units);
+///
+/// This function is called when a write function writes more code units than
+/// the buffer's available space. When an __max_output_size object is provided
+/// the number of code units is the number of code units returned from
+/// __max_output_size::__request_write function.
+///
+/// - The __buffer contains *this. Since the class containing this function
+///   inherits from __output_buffer it's safe to cast it to the subclass being
+///   used.
+/// - The __code_units is the number of code units the caller will write + 1.
+///   - This value does not take the available space of the buffer into account.
+///   - The push_back function is more efficient when writing before resizing,
+///     this means the buffer should always have room for one code unit. Hence
+///     the + 1 is the size.
+/// - When the function returns there is room for at least one additional code
+///   unit. There is no requirement there is room for __code_units code units:
+///   - The class has some "bulk" operations. For example, __copy which copies
+///     the contents of a basic_string_view to the output. If the sub-class has
+///     a fixed size buffer the size of the basic_string_view may be larger
+///     than the buffer. In that case it's impossible to honor the requested
+///     size.
+///   - The at least one code unit makes sure the entire output can be written.
+///     (Obviously making room one code unit at a time is slow and
+///     it's recommended to return a larger available space.)
+///   - When the buffer has room for at least one code unit the function may be
+///     a no-op.
+/// - When the function makes space for more code units it uses one for these
+///   functions to signal the change:
+///   - __buffer_flushed()
+///     - This function is typically used for a fixed sized buffer.
+///     - The current contents of [__ptr_, __ptr_ + __size_) have been
+///       processed.
+///     - __ptr_ remains unchanged.
+///     - __capacity_ remains unchanged.
+///     - __size_ will be set to 0.
+///   - __buffer_moved(_CharT* __ptr, size_t __capacity)
+///     - This function is typically used for a dynamic sized buffer. There the
+///       location of the buffer changes due to reallocations.
+///     - __ptr_ will be set to __ptr. (This value may be the old value of
+///       __ptr_).
+///     - __capacity_ will be set to __capacity. (This value may be the old
+///       value of __capacity_).
+///     - __size_ remains unchanged,
+///     - The range [__ptr, __ptr + __size_) contains the original data of the
+///       range  [__ptr_, __ptr_ + __size_).
+///
+/// The push_back function expects a valid buffer and a capacity of at least 1.
+/// This means:
+/// - The class is constructed with a valid buffer,
+/// - __buffer_moved is called with a valid buffer is used before the first
+///   write operation,
+/// - no write function is ever called, or
+/// - the class is constructed with a __max_output_size object with __max_size 0.
+///
+/// The latter option allows formatted_size to use the output buffer without
+/// ever writing anything to the buffer.
 template <__fmt_char_type _CharT>
 class _LIBCPP_TEMPLATE_VIS __output_buffer {
 public:
-  using value_type = _CharT;
+  using value_type           = _CharT;
+  using __prepare_write_type = void (*)(__output_buffer<_CharT>&, size_t);
 
-  template <class _Tp>
-  _LIBCPP_HIDE_FROM_ABI explicit __output_buffer(_CharT* __ptr, size_t __capacity, _Tp* __obj)
-      : __ptr_(__ptr),
-        __capacity_(__capacity),
-        __flush_([](_CharT* __p, size_t __n, void* __o) { static_cast<_Tp*>(__o)->__flush(__p, __n); }),
-        __obj_(__obj) {}
+  [[nodiscard]]
+  _LIBCPP_HIDE_FROM_ABI explicit __output_buffer(_CharT* __ptr, size_t __capacity, __prepare_write_type __function)
+      : __output_buffer{__ptr, __capacity, __function, nullptr} {}
 
-  _LIBCPP_HIDE_FROM_ABI void __reset(_CharT* __ptr, size_t __capacity) {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __output_buffer(
+      _CharT* __ptr, size_t __capacity, __prepare_write_type __function, __max_output_size* __max_output_size)
+      : __ptr_(__ptr), __capacity_(__capacity), __prepare_write_(__function), __max_output_size_(__max_output_size) {}
+
+  _LIBCPP_HIDE_FROM_ABI void __buffer_flushed() { __size_ = 0; }
+
+  _LIBCPP_HIDE_FROM_ABI void __buffer_moved(_CharT* __ptr, size_t __capacity) {
     __ptr_      = __ptr;
     __capacity_ = __capacity;
   }
@@ -79,12 +208,18 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer {
 
   // Used in std::back_insert_iterator.
   _LIBCPP_HIDE_FROM_ABI void push_back(_CharT __c) {
+    if (__max_output_size_ && __max_output_size_->__write_request(1) == 0)
+      return;
+
+    _LIBCPP_ASSERT_INTERNAL(
+        __ptr_ && __size_ < __capacity_ && __available() >= 1, "attempted to write outside the buffer");
+
     __ptr_[__size_++] = __c;
 
     // Profiling showed flushing after adding is more efficient than flushing
     // when entering the function.
     if (__size_ == __capacity_)
-      __flush();
+      __prepare_write(0);
   }
 
   /// Copies the input __str to the buffer.
@@ -105,25 +240,20 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer {
     // upper case. For integral these strings are short.
     // TODO FMT Look at the improvements above.
     size_t __n = __str.size();
-
-    __flush_on_overflow(__n);
-    if (__n < __capacity_) { //  push_back requires the buffer to have room for at least one character (so use <).
-      std::copy_n(__str.data(), __n, std::addressof(__ptr_[__size_]));
-      __size_ += __n;
-      return;
+    if (__max_output_size_) {
+      __n = __max_output_size_->__write_request(__n);
+      if (__n == 0)
+        return;
     }
 
-    // The output doesn't fit in the internal buffer.
-    // Copy the data in "__capacity_" sized chunks.
-    _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow");
     const _InCharT* __first = __str.data();
     do {
-      size_t __chunk = std::min(__n, __capacity_);
+      __prepare_write(__n);
+      size_t __chunk = std::min(__n, __available());
       std::copy_n(__first, __chunk, std::addressof(__ptr_[__size_]));
-      __size_ = __chunk;
+      __size_ += __chunk;
       __first += __chunk;
       __n -= __chunk;
-      __flush();
     } while (__n);
   }
 
@@ -137,121 +267,59 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer {
     _LIBCPP_ASSERT_INTERNAL(__first <= __last, "not a valid range");
 
     size_t __n = static_cast<size_t>(__last - __first);
-    __flush_on_overflow(__n);
-    if (__n < __capacity_) { //  push_back requires the buffer to have room for at least one character (so use <).
-      std::transform(__first, __last, std::addressof(__ptr_[__size_]), std::move(__operation));
-      __size_ += __n;
-      return;
+    if (__max_output_size_) {
+      __n = __max_output_size_->__write_request(__n);
+      if (__n == 0)
+        return;
     }
 
-    // The output doesn't fit in the internal buffer.
-    // Transform the data in "__capacity_" sized chunks.
-    _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow");
     do {
-      size_t __chunk = std::min(__n, __capacity_);
+      __prepare_write(__n);
+      size_t __chunk = std::min(__n, __available());
       std::transform(__first, __first + __chunk, std::addressof(__ptr_[__size_]), __operation);
-      __size_ = __chunk;
+      __size_ += __chunk;
       __first += __chunk;
       __n -= __chunk;
-      __flush();
     } while (__n);
   }
 
   /// A \c fill_n wrapper.
   _LIBCPP_HIDE_FROM_ABI void __fill(size_t __n, _CharT __value) {
-    __flush_on_overflow(__n);
-    if (__n < __capacity_) { //  push_back requires the buffer to have room for at least one character (so use <).
-      std::fill_n(std::addressof(__ptr_[__size_]), __n, __value);
-      __size_ += __n;
-      return;
+    if (__max_output_size_) {
+      __n = __max_output_size_->__write_request(__n);
+      if (__n == 0)
+        return;
     }
 
-    // The output doesn't fit in the internal buffer.
-    // Fill the buffer in "__capacity_" sized chunks.
-    _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow");
     do {
-      size_t __chunk = std::min(__n, __capacity_);
+      __prepare_write(__n);
+      size_t __chunk = std::min(__n, __available());
       std::fill_n(std::addressof(__ptr_[__size_]), __chunk, __value);
-      __size_ = __chunk;
+      __size_ += __chunk;
       __n -= __chunk;
-      __flush();
     } while (__n);
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __flush() {
-    __flush_(__ptr_, __size_, __obj_);
-    __size_ = 0;
-  }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __capacity() const { return __capacity_; }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __size() const { return __size_; }
 
 private:
   _CharT* __ptr_;
   size_t __capacity_;
   size_t __size_{0};
-  void (*__flush_)(_CharT*, size_t, void*);
-  void* __obj_;
+  void (*__prepare_write_)(__output_buffer<_CharT>&, size_t);
+  __max_output_size* __max_output_size_;
 
-  /// Flushes the buffer when the output operation would overflow the buffer.
-  ///
-  /// A simple approach for the overflow detection would be something along the
-  /// lines:
-  /// \code
-  /// // The internal buffer is large enough.
-  /// if (__n <= __capacity_) {
-  ///   // Flush when we really would overflow.
-  ///   if (__size_ + __n >= __capacity_)
-  ///     __flush();
-  ///   ...
-  /// }
-  /// \endcode
-  ///
-  /// This approach works for all cases but one:
-  /// A __format_to_n_buffer_base where \ref __enable_direct_output is true.
-  /// In that case the \ref __capacity_ of the buffer changes during the first
-  /// \ref __flush. During that operation the output buffer switches from its
-  /// __writer_ to its __storage_. The \ref __capacity_ of the former depends
-  /// on the value of n, of the latter is a fixed size. For example:
-  /// - a format_to_n call with a 10'000 char buffer,
-  /// - the buffer is filled with 9'500 chars,
-  /// - adding 1'000 elements would overflow the buffer so the buffer gets
-  ///   changed and the \ref __capacity_ decreases from 10'000 to
-  ///   __buffer_size (256 at the time of writing).
-  ///
-  /// This means that the \ref __flush for this class may need to copy a part of
-  /// the internal buffer to the proper output. In this example there will be
-  /// 500 characters that need this copy operation.
-  ///
-  /// Note it would be more efficient to write 500 chars directly and then swap
-  /// the buffers. This would make the code more complex and \ref format_to_n is
-  /// not the most common use case. Therefore the optimization isn't done.
-  _LIBCPP_HIDE_FROM_ABI void __flush_on_overflow(size_t __n) {
-    if (__size_ + __n >= __capacity_)
-      __flush();
-  }
-};
-
-/// A storage using an internal buffer.
-///
-/// This storage is used when writing a single element to the output iterator
-/// is expensive.
-template <__fmt_char_type _CharT>
-class _LIBCPP_TEMPLATE_VIS __internal_storage {
-public:
-  _LIBCPP_HIDE_FROM_ABI _CharT* __begin() { return __buffer_; }
-
-  static constexpr size_t __buffer_size = 256 / sizeof(_CharT);
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __available() const { return __capacity_ - __size_; }
 
-private:
-  _CharT __buffer_[__buffer_size];
+  _LIBCPP_HIDE_FROM_ABI void __prepare_write(size_t __code_units) {
+    // Always have space for one additional code unit. This is a precondition of the push_back function.
+    __code_units += 1;
+    if (__available() < __code_units)
+      __prepare_write_(*this, __code_units + 1);
+  }
 };
 
-/// A storage writing directly to the storage.
-///
-/// This requires the storage to be a contiguous buffer of \a _CharT.
-/// Since the output is directly written to the underlying storage this class
-/// is just an empty class.
-template <__fmt_char_type _CharT>
-class _LIBCPP_TEMPLATE_VIS __direct_storage {};
-
 template <class _OutIt, class _CharT>
 concept __enable_direct_output =
     __fmt_char_type<_CharT> &&
@@ -260,40 +328,6 @@ concept __enable_direct_output =
      // `#ifdef`.
      || same_as<_OutIt, __wrap_iter<_CharT*>>);
 
-/// Write policy for directly writing to the underlying output.
-template <class _OutIt, __fmt_char_type _CharT>
-class _LIBCPP_TEMPLATE_VIS __writer_direct {
-public:
-  _LIBCPP_HIDE_FROM_ABI explicit __writer_direct(_OutIt __out_it) : __out_it_(__out_it) {}
-
-  _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() { return __out_it_; }
-
-  _LIBCPP_HIDE_FROM_ABI void __flush(_CharT*, size_t __n) {
-    // _OutIt can be a __wrap_iter<CharT*>. Therefore the original iterator
-    // is adjusted.
-    __out_it_ += __n;
-  }
-
-private:
-  _OutIt __out_it_;
-};
-
-/// Write policy for copying the buffer to the output.
-template <class _OutIt, __fmt_char_type _CharT>
-class _LIBCPP_TEMPLATE_VIS __writer_iterator {
-public:
-  _LIBCPP_HIDE_FROM_ABI explicit __writer_iterator(_OutIt __out_it) : __out_it_{std::move(__out_it)} {}
-
-  _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { return std::move(__out_it_); }
-
-  _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) {
-    __out_it_ = std::ranges::copy_n(__ptr, __n, std::move(__out_it_)).out;
-  }
-
-private:
-  _OutIt __out_it_;
-};
-
 /// Concept to see whether a \a _Container is insertable.
 ///
 /// The concept is used to validate whether multiple calls to a
@@ -319,188 +353,212 @@ struct _LIBCPP_TEMPLATE_VIS __back_insert_iterator_container<back_insert_iterato
   using type = _Container;
 };
 
-/// Write policy for inserting the buffer in a container.
-template <class _Container>
-class _LIBCPP_TEMPLATE_VIS __writer_container {
+// A dynamically growing buffer.
+template <__fmt_char_type _CharT>
+class _LIBCPP_TEMPLATE_VIS __allocating_buffer : public __output_buffer<_CharT> {
 public:
-  using _CharT = typename _Container::value_type;
+  __allocating_buffer(const __allocating_buffer&)            = delete;
+  __allocating_buffer& operator=(const __allocating_buffer&) = delete;
 
-  _LIBCPP_HIDE_FROM_ABI explicit __writer_container(back_insert_iterator<_Container> __out_it)
-      : __container_{__out_it.__get_container()} {}
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __allocating_buffer() : __allocating_buffer{nullptr} {}
 
-  _LIBCPP_HIDE_FROM_ABI auto __out_it() { return std::back_inserter(*__container_); }
+  [[nodiscard]]
+  _LIBCPP_HIDE_FROM_ABI explicit __allocating_buffer(__max_output_size* __max_output_size)
+      : __output_buffer<_CharT>{__small_buffer_, __buffer_size_, __prepare_write, __max_output_size} {}
 
-  _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) {
-    __container_->insert(__container_->end(), __ptr, __ptr + __n);
+  _LIBCPP_HIDE_FROM_ABI ~__allocating_buffer() {
+    if (__ptr_ != __small_buffer_)
+      _Alloc{}.deallocate(__ptr_, this->__capacity());
   }
 
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI basic_string_view<_CharT> __view() { return {__ptr_, this->__size()}; }
+
 private:
-  _Container* __container_;
-};
+  using _Alloc = allocator<_CharT>;
 
-/// Selects the type of the writer used for the output iterator.
-template <class _OutIt, class _CharT>
-class _LIBCPP_TEMPLATE_VIS __writer_selector {
-  using _Container = typename __back_insert_iterator_container<_OutIt>::type;
+  // Since allocating is expensive the class has a small internal buffer. When
+  // its capacity is exceeded a dynamic buffer will be allocated.
+  static constexpr size_t __buffer_size_ = 256;
+  _CharT __small_buffer_[__buffer_size_];
 
-public:
-  using type =
-      conditional_t<!same_as<_Container, void>,
-                    __writer_container<_Container>,
-                    conditional_t<__enable_direct_output<_OutIt, _CharT>,
-                                  __writer_direct<_OutIt, _CharT>,
-                                  __writer_iterator<_OutIt, _CharT>>>;
-};
-
-/// The generic formatting buffer.
-template <class _OutIt, __fmt_char_type _CharT>
-  requires(output_iterator<_OutIt, const _CharT&>)
-class _LIBCPP_TEMPLATE_VIS __format_buffer {
-  using _Storage =
-      conditional_t<__enable_direct_output<_OutIt, _CharT>, __direct_storage<_CharT>, __internal_storage<_CharT>>;
+  _CharT* __ptr_{__small_buffer_};
 
-public:
-  _LIBCPP_HIDE_FROM_ABI explicit __format_buffer(_OutIt __out_it)
-    requires(same_as<_Storage, __internal_storage<_CharT>>)
-      : __output_(__storage_.__begin(), __storage_.__buffer_size, this), __writer_(std::move(__out_it)) {}
+  _LIBCPP_HIDE_FROM_ABI void __grow_buffer(size_t __capacity) {
+    if (__capacity < __buffer_size_)
+      return;
 
-  _LIBCPP_HIDE_FROM_ABI explicit __format_buffer(_OutIt __out_it)
-    requires(same_as<_Storage, __direct_storage<_CharT>>)
-      : __output_(std::__unwrap_iter(__out_it), size_t(-1), this), __writer_(std::move(__out_it)) {}
+    _LIBCPP_ASSERT_INTERNAL(__capacity > this->__capacity(), "the buffer must grow");
 
-  _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return __output_.__make_output_iterator(); }
+    // _CharT is an implicit lifetime type so can be used without explicit
+    // construction or destruction.
+    _Alloc __alloc;
+    auto __result = std::__allocate_at_least(__alloc, __capacity);
+    std::copy_n(__ptr_, this->__size(), __result.ptr);
+    if (__ptr_ != __small_buffer_)
+      __alloc.deallocate(__ptr_, this->__capacity());
 
-  _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { __writer_.__flush(__ptr, __n); }
+    __ptr_ = __result.ptr;
+    this->__buffer_moved(__ptr_, __result.count);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && {
-    __output_.__flush();
-    return std::move(__writer_).__out_it();
+  _LIBCPP_HIDE_FROM_ABI void __prepare_write(size_t __size_hint) {
+    __grow_buffer(std::max<size_t>(this->__capacity() + __size_hint, this->__capacity() * 1.6));
   }
 
-private:
-  _LIBCPP_NO_UNIQUE_ADDRESS _Storage __storage_;
-  __output_buffer<_CharT> __output_;
-  typename __writer_selector<_OutIt, _CharT>::type __writer_;
+  _LIBCPP_HIDE_FROM_ABI static void __prepare_write(__output_buffer<_CharT>& __buffer, size_t __size_hint) {
+    static_cast<__allocating_buffer<_CharT>&>(__buffer).__prepare_write(__size_hint);
+  }
 };
 
-/// A buffer that counts the number of insertions.
-///
-/// Since \ref formatted_size only needs to know the size, the output itself is
-/// discarded.
-template <__fmt_char_type _CharT>
-class _LIBCPP_TEMPLATE_VIS __formatted_size_buffer {
+// A buffer that directly writes to the underlying buffer.
+template <class _OutIt, __fmt_char_type _CharT>
+class _LIBCPP_TEMPLATE_VIS __direct_iterator_buffer : public __output_buffer<_CharT> {
 public:
-  _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return __output_.__make_output_iterator(); }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __direct_iterator_buffer(_OutIt __out_it)
+      : __direct_iterator_buffer{__out_it, nullptr} {}
 
-  _LIBCPP_HIDE_FROM_ABI void __flush(const _CharT*, size_t __n) { __size_ += __n; }
+  [[nodiscard]]
+  _LIBCPP_HIDE_FROM_ABI explicit __direct_iterator_buffer(_OutIt __out_it, __max_output_size* __max_output_size)
+      : __output_buffer<_CharT>{std::__unwrap_iter(__out_it), __buffer_size, __prepare_write, __max_output_size},
+        __out_it_(__out_it) {}
 
-  _LIBCPP_HIDE_FROM_ABI size_t __result() && {
-    __output_.__flush();
-    return __size_;
-  }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { return __out_it_ + this->__size(); }
 
 private:
-  __internal_storage<_CharT> __storage_;
-  __output_buffer<_CharT> __output_{__storage_.__begin(), __storage_.__buffer_size, this};
-  size_t __size_{0};
-};
+  // The function format_to expects a buffer large enough for the output. The
+  // function format_to_n has its own helper class that restricts the number of
+  // write options. So this function class can pretend to have an infinite
+  // buffer.
+  static constexpr size_t __buffer_size = -1;
+
+  _OutIt __out_it_;
 
-/// The base of a buffer that counts and limits the number of insertions.
-template <class _OutIt, __fmt_char_type _CharT, bool>
-  requires(output_iterator<_OutIt, const _CharT&>)
-struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base {
-  using _Size = iter_difference_t<_OutIt>;
+  _LIBCPP_HIDE_FROM_ABI static void
+  __prepare_write([[maybe_unused]] __output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) {
+    std::__throw_length_error("__direct_iterator_buffer");
+  }
+};
 
+// A buffer that writes its output to the end of a container.
+template <class _OutIt, __fmt_char_type _CharT>
+class _LIBCPP_TEMPLATE_VIS __container_inserter_buffer : public __output_buffer<_CharT> {
 public:
-  _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size)
-      : __writer_(std::move(__out_it)), __max_size_(std::max(_Size(0), __max_size)) {}
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __container_inserter_buffer(_OutIt __out_it)
+      : __container_inserter_buffer{__out_it, nullptr} {}
 
-  _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) {
-    if (_Size(__size_) <= __max_size_)
-      __writer_.__flush(__ptr, std::min(_Size(__n), __max_size_ - __size_));
-    __size_ += __n;
+  [[nodiscard]]
+  _LIBCPP_HIDE_FROM_ABI explicit __container_inserter_buffer(_OutIt __out_it, __max_output_size* __max_output_size)
+      : __output_buffer<_CharT>{__small_buffer_, __buffer_size, __prepare_write, __max_output_size},
+        __container_{__out_it.__get_container()} {}
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __out_it() && {
+    __container_->insert(__container_->end(), __small_buffer_, __small_buffer_ + this->__size());
+    return std::back_inserter(*__container_);
   }
 
-protected:
-  __internal_storage<_CharT> __storage_;
-  __output_buffer<_CharT> __output_{__storage_.__begin(), __storage_.__buffer_size, this};
-  typename __writer_selector<_OutIt, _CharT>::type __writer_;
+private:
+  typename __back_insert_iterator_container<_OutIt>::type* __container_;
+
+  // This class uses a fixed size buffer and appends the elements in
+  // __buffer_size chunks. An alternative would be to use an allocating buffer
+  // and append the output in a single write operation. Benchmarking showed no
+  // performance difference.
+  static constexpr size_t __buffer_size = 256;
+  _CharT __small_buffer_[__buffer_size];
+
+  _LIBCPP_HIDE_FROM_ABI void __prepare_write() {
+    __container_->insert(__container_->end(), __small_buffer_, __small_buffer_ + this->__size());
+    this->__buffer_flushed();
+  }
 
-  _Size __max_size_;
-  _Size __size_{0};
+  _LIBCPP_HIDE_FROM_ABI static void
+  __prepare_write(__output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) {
+    static_cast<__container_inserter_buffer<_OutIt, _CharT>&>(__buffer).__prepare_write();
+  }
 };
 
-/// The base of a buffer that counts and limits the number of insertions.
-///
-/// This version is used when \c __enable_direct_output<_OutIt, _CharT> == true.
-///
-/// This class limits the size available to the direct writer so it will not
-/// exceed the maximum number of code units.
+// A buffer that writes to an iterator.
+//
+// Unlike the __container_inserter_buffer this class' performance does benefit
+// from allocating and then inserting.
 template <class _OutIt, __fmt_char_type _CharT>
-  requires(output_iterator<_OutIt, const _CharT&>)
-class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base<_OutIt, _CharT, true> {
-  using _Size = iter_difference_t<_OutIt>;
-
+class _LIBCPP_TEMPLATE_VIS __iterator_buffer : public __allocating_buffer<_CharT> {
 public:
-  _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size)
-      : __output_(std::__unwrap_iter(__out_it), __max_size, this),
-        __writer_(std::move(__out_it)),
-        __max_size_(__max_size) {
-    if (__max_size <= 0) [[unlikely]]
-      __output_.__reset(__storage_.__begin(), __storage_.__buffer_size);
-  }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __iterator_buffer(_OutIt __out_it)
+      : __allocating_buffer<_CharT>{}, __out_it_{std::move(__out_it)} {}
 
-  _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) {
-    // A __flush to the direct writer happens in the following occasions:
-    // - The format function has written the maximum number of allowed code
-    //   units. At this point it's no longer valid to write to this writer. So
-    //   switch to the internal storage. This internal storage doesn't need to
-    //   be written anywhere so the __flush for that storage writes no output.
-    // - Like above, but the next "mass write" operation would overflow the
-    //   buffer. In that case the buffer is pre-emptively switched. The still
-    //   valid code units will be written separately.
-    // - The format_to_n function is finished. In this case there's no need to
-    //   switch the buffer, but for simplicity the buffers are still switched.
-    // When the __max_size <= 0 the constructor already switched the buffers.
-    if (__size_ == 0 && __ptr != __storage_.__begin()) {
-      __writer_.__flush(__ptr, __n);
-      __output_.__reset(__storage_.__begin(), __storage_.__buffer_size);
-    } else if (__size_ < __max_size_) {
-      // Copies a part of the internal buffer to the output up to n characters.
-      // See __output_buffer<_CharT>::__flush_on_overflow for more information.
-      _Size __s = std::min(_Size(__n), __max_size_ - __size_);
-      std::copy_n(__ptr, __s, __writer_.__out_it());
-      __writer_.__flush(__ptr, __s);
-    }
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __iterator_buffer(_OutIt __out_it, __max_output_size* __max_output_size)
+      : __allocating_buffer<_CharT>{__max_output_size}, __out_it_{std::move(__out_it)} {}
 
-    __size_ += __n;
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __out_it() && {
+    return std::ranges::copy(this->__view(), std::move(__out_it_)).out;
   }
 
-protected:
-  __internal_storage<_CharT> __storage_;
-  __output_buffer<_CharT> __output_;
-  __writer_direct<_OutIt, _CharT> __writer_;
+private:
+  _OutIt __out_it_;
+};
+
+// Selects the type of the buffer used for the output iterator.
+template <class _OutIt, __fmt_char_type _CharT>
+class _LIBCPP_TEMPLATE_VIS __buffer_selector {
+  using _Container = __back_insert_iterator_container<_OutIt>::type;
 
-  _Size __max_size_;
-  _Size __size_{0};
+public:
+  using type =
+      conditional_t<!same_as<_Container, void>,
+                    __container_inserter_buffer<_OutIt, _CharT>,
+                    conditional_t<__enable_direct_output<_OutIt, _CharT>,
+                                  __direct_iterator_buffer<_OutIt, _CharT>,
+                                  __iterator_buffer<_OutIt, _CharT>>>;
 };
 
-/// The buffer that counts and limits the number of insertions.
+// A buffer that counts and limits the number of insertions.
 template <class _OutIt, __fmt_char_type _CharT>
-  requires(output_iterator<_OutIt, const _CharT&>)
-struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer final
-    : public __format_to_n_buffer_base< _OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>> {
-  using _Base = __format_to_n_buffer_base<_OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>>;
-  using _Size = iter_difference_t<_OutIt>;
+class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer : private __buffer_selector<_OutIt, _CharT>::type {
+public:
+  using _Base = __buffer_selector<_OutIt, _CharT>::type;
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __format_to_n_buffer(_OutIt __out_it, iter_difference_t<_OutIt> __n)
+      : _Base{std::move(__out_it), std::addressof(__max_output_size_)},
+        __max_output_size_{__n < 0 ? size_t{0} : static_cast<size_t>(__n)} {}
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return _Base::__make_output_iterator(); }
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __result() && {
+    return {static_cast<_Base&&>(*this).__out_it(),
+            static_cast<iter_difference_t<_OutIt>>(__max_output_size_.__code_units_written())};
+  }
+
+private:
+  __max_output_size __max_output_size_;
+};
 
+// A buffer that counts the number of insertions.
+//
+// Since formatted_size only needs to know the size, the output itself is
+// discarded.
+template <__fmt_char_type _CharT>
+class _LIBCPP_TEMPLATE_VIS __formatted_size_buffer : private __output_buffer<_CharT> {
 public:
-  _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer(_OutIt __out_it, _Size __max_size)
-      : _Base(std::move(__out_it), __max_size) {}
-  _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return this->__output_.__make_output_iterator(); }
+  using _Base = __output_buffer<_CharT>;
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __formatted_size_buffer()
+      : _Base{nullptr, 0, __prepare_write, std::addressof(__max_output_size_)} {}
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return _Base::__make_output_iterator(); }
+
+  // This function does not need to be r-value qualified, however this is
+  // consistent with similar objects.
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __result() && { return __max_output_size_.__code_units_written(); }
+
+private:
+  __max_output_size __max_output_size_{0};
 
-  _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __result() && {
-    this->__output_.__flush();
-    return {std::move(this->__writer_).__out_it(), this->__size_};
+  _LIBCPP_HIDE_FROM_ABI static void
+  __prepare_write([[maybe_unused]] __output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) {
+    // Note this function does not satisfy the requirement of giving a 1 code unit buffer.
+    _LIBCPP_ASSERT_INTERNAL(
+        false, "Since __max_output_size_.__max_size_ == 0 there should never be call to this function.");
   }
 };
 
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index d14b49aff14957..01cefc26069829 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -31,7 +31,6 @@
 #include <__format/formatter_pointer.h>
 #include <__format/formatter_string.h>
 #include <__format/parser_std_format_spec.h>
-#include <__iterator/back_insert_iterator.h>
 #include <__iterator/concepts.h>
 #include <__iterator/incrementable_traits.h>
 #include <__iterator/iterator_traits.h> // iter_value_t
@@ -411,7 +410,7 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __vformat_to(_OutIt __out_it,
     return std::__format::__vformat_to(
         basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(std::move(__out_it), __args));
   else {
-    __format::__format_buffer<_OutIt, _CharT> __buffer{std::move(__out_it)};
+    typename __format::__buffer_selector<_OutIt, _CharT>::type __buffer{std::move(__out_it)};
     std::__format::__vformat_to(basic_format_parse_context{__fmt, __args.__size()},
                                 std::__format_context_create(__buffer.__make_output_iterator(), __args));
     return std::move(__buffer).__out_it();
@@ -452,9 +451,9 @@ format_to(_OutIt __out_it, wformat_string<_Args...> __fmt, _Args&&... __args) {
 // fires too eagerly, see http://llvm.org/PR61563.
 template <class = void>
 [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(string_view __fmt, format_args __args) {
-  string __res;
-  std::vformat_to(std::back_inserter(__res), __fmt, __args);
-  return __res;
+  __format::__allocating_buffer<char> __buffer;
+  std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args);
+  return string{__buffer.__view()};
 }
 
 #  ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
@@ -463,9 +462,9 @@ template <class = void>
 template <class = void>
 [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring
 vformat(wstring_view __fmt, wformat_args __args) {
-  wstring __res;
-  std::vformat_to(std::back_inserter(__res), __fmt, __args);
-  return __res;
+  __format::__allocating_buffer<wchar_t> __buffer;
+  std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args);
+  return wstring{__buffer.__view()};
 }
 #  endif
 
@@ -544,7 +543,7 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __vformat_to(
     return std::__format::__vformat_to(basic_format_parse_context{__fmt, __args.__size()},
                                        std::__format_context_create(std::move(__out_it), __args, std::move(__loc)));
   else {
-    __format::__format_buffer<_OutIt, _CharT> __buffer{std::move(__out_it)};
+    typename __format::__buffer_selector<_OutIt, _CharT>::type __buffer{std::move(__out_it)};
     std::__format::__vformat_to(
         basic_format_parse_context{__fmt, __args.__size()},
         std::__format_context_create(__buffer.__make_output_iterator(), __args, std::move(__loc)));
@@ -585,9 +584,9 @@ format_to(_OutIt __out_it, locale __loc, wformat_string<_Args...> __fmt, _Args&&
 template <class = void>
 [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string
 vformat(locale __loc, string_view __fmt, format_args __args) {
-  string __res;
-  std::vformat_to(std::back_inserter(__res), std::move(__loc), __fmt, __args);
-  return __res;
+  __format::__allocating_buffer<char> __buffer;
+  std::vformat_to(__buffer.__make_output_iterator(), std::move(__loc), __fmt, __args);
+  return string{__buffer.__view()};
 }
 
 #    ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
@@ -596,9 +595,9 @@ vformat(locale __loc, string_view __fmt, format_args __args) {
 template <class = void>
 [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring
 vformat(locale __loc, wstring_view __fmt, wformat_args __args) {
-  wstring __res;
-  std::vformat_to(std::back_inserter(__res), std::move(__loc), __fmt, __args);
-  return __res;
+  __format::__allocating_buffer<wchar_t> __buffer;
+  std::vformat_to(__buffer.__make_output_iterator(), std::move(__loc), __fmt, __args);
+  return wstring{__buffer.__view()};
 }
 #    endif
 
diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv
index 51e659f52000b4..622fced5ffa40d 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx03.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv
@@ -245,21 +245,11 @@ experimental/simd limits
 experimental/utility utility
 filesystem compare
 filesystem concepts
-filesystem cstddef
-filesystem cstdint
 filesystem cstdlib
 filesystem cstring
-filesystem ctime
-filesystem iomanip
 filesystem iosfwd
-filesystem limits
-filesystem locale
 filesystem new
-filesystem ratio
-filesystem string
-filesystem string_view
 filesystem system_error
-filesystem type_traits
 filesystem version
 format array
 format cctype
@@ -787,15 +777,7 @@ stack version
 stdexcept cstdlib
 stdexcept exception
 stdexcept iosfwd
-stop_token atomic
-stop_token cstddef
-stop_token cstdint
-stop_token cstring
-stop_token ctime
 stop_token iosfwd
-stop_token limits
-stop_token ratio
-stop_token type_traits
 stop_token version
 streambuf cctype
 streambuf climits
diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv
index 17e85e982729cf..11c3c1322c4069 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx11.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv
@@ -246,21 +246,11 @@ experimental/simd limits
 experimental/utility utility
 filesystem compare
 filesystem concepts
-filesystem cstddef
-filesystem cstdint
 filesystem cstdlib
 filesystem cstring
-filesystem ctime
-filesystem iomanip
 filesystem iosfwd
-filesystem limits
-filesystem locale
 filesystem new
-filesystem ratio
-filesystem string
-filesystem string_view
 filesystem system_error
-filesystem type_traits
 filesystem version
 format array
 format cctype
@@ -793,15 +783,7 @@ stack version
 stdexcept cstdlib
 stdexcept exception
 stdexcept iosfwd
-stop_token atomic
-stop_token cstddef
-stop_token cstdint
-stop_token cstring
-stop_token ctime
 stop_token iosfwd
-stop_token limits
-stop_token ratio
-stop_token type_traits
 stop_token version
 streambuf cctype
 streambuf climits
diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv
index 8aed93da9e6ccf..666d5c38964673 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx14.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv
@@ -249,21 +249,11 @@ experimental/type_traits type_traits
 experimental/utility utility
 filesystem compare
 filesystem concepts
-filesystem cstddef
-filesystem cstdint
 filesystem cstdlib
 filesystem cstring
-filesystem ctime
-filesystem iomanip
 filesystem iosfwd
-filesystem limits
-filesystem locale
 filesystem new
-filesystem ratio
-filesystem string
-filesystem string_view
 filesystem system_error
-filesystem type_traits
 filesystem version
 format array
 format cctype
@@ -796,15 +786,7 @@ stack version
 stdexcept cstdlib
 stdexcept exception
 stdexcept iosfwd
-stop_token atomic
-stop_token cstddef
-stop_token cstdint
-stop_token cstring
-stop_token ctime
 stop_token iosfwd
-stop_token limits
-stop_token ratio
-stop_token type_traits
 stop_token version
 streambuf cctype
 streambuf climits
diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv
index 2c028462144eee..3a3aa5a8944733 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx17.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv
@@ -797,15 +797,7 @@ stack version
 stdexcept cstdlib
 stdexcept exception
 stdexcept iosfwd
-stop_token atomic
-stop_token cstddef
-stop_token cstdint
-stop_token cstring
-stop_token ctime
 stop_token iosfwd
-stop_token limits
-stop_token ratio
-stop_token type_traits
 stop_token version
 streambuf cctype
 streambuf climits
diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h
index b2ed6775fe8a13..3969b341cb1463 100644
--- a/libcxx/test/std/utilities/format/format.functions/format_tests.h
+++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h
@@ -3038,7 +3038,7 @@ void format_test_buffer_optimizations(TestFunction check) {
   // Used to validate our test sets are the proper size.
   // To test the chunked operations it needs to be larger than the internal
   // buffer. Picked a nice looking number.
-  constexpr int minimum = 3 * std::__format::__internal_storage<CharT>::__buffer_size;
+  constexpr int minimum = 3 * 256;
 #else
   constexpr int minimum = 1;
 #endif



More information about the llvm-branch-commits mailing list