[libcxx-commits] [libcxx] [libcxx] Recalculate union-member sizes in basic_string (PR #168742)
Simon Tatham via libcxx-commits
libcxx-commits at lists.llvm.org
Mon Nov 24 05:36:54 PST 2025
https://github.com/statham-arm updated https://github.com/llvm/llvm-project/pull/168742
>From d51148dcb2412750920b5e3fa18c343cb7b4c173 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Wed, 19 Nov 2025 17:21:55 +0000
Subject: [PATCH 1/6] [libcxx] Recalculate union-member sizes in basic_string
Fixes #51158: instantiating a `basic_string` with a character type of
3, 5, or greater than 8 wouldn't work properly, because the two
members of its internal union disagreed on the location of the bit
flag indicating which member was live.
This commit completely rewrites the calculations of how much to pad
the two sub-structures so that their lengths match. Two new
static_asserts check that, after all the calculation is done, the
compiler agrees that the structures really did end up the same size. A
new test instantiates `basic_string` with every size of character type
up to 16, and four different alignment requirements.
The new code now potentially has to pad _both_ sub-structures,
depending on what is and is not divisible by the size of the character
type. This also means that the `__padding` type must have a method to
zero it out (otherwise the `__grow_by` method stops being constexpr,
which an existing test depends on).
I've also added some extra checks to the existing test
`push_back.pass.cpp`, which _could_ have found this issue by checking
that pushing huge characters on to a string updated the length in the
appropriate way.
---
libcxx/include/string | 57 ++++++++-
.../basic.string/awkward-char-types.pass.cpp | 114 ++++++++++++++++++
.../string_append/push_back.pass.cpp | 4 +
3 files changed, 169 insertions(+), 6 deletions(-)
create mode 100644 libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
diff --git a/libcxx/include/string b/libcxx/include/string
index 09fc6228c4fdb..50fcf5dc6fa44 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -719,10 +719,16 @@ struct __init_with_sentinel_tag {};
template <size_t _PaddingSize>
struct __padding {
char __padding_[_PaddingSize];
+ constexpr void clear() {
+ __padding __initialized = {0};
+ *this = __initialized;
+ }
};
template <>
-struct __padding<0> {};
+struct __padding<0> {
+ constexpr void clear() {}
+};
template <class _CharT, class _Traits, class _Allocator>
class basic_string {
@@ -819,6 +825,33 @@ private:
# ifdef _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+ // The __long structure contains at least a `pointer` to an allocated buffer,
+ // and two `size_type` (the second one being a bitfield container). This
+ // struct, only used by `sizeof`, is the minimum that will fit those fields.
+ struct __long_min {
+ pointer __data_;
+ size_type __size_;
+ size_type __bitfield_container;
+ };
+
+ // The __short structure has a byte-sized bitfield container at the end, and
+ // the rest of it can be taken up by value_type. We want at least two
+ // value_type, or more if they will fit.
+ static constexpr size_t __fit_cap = (sizeof(__long_min) - 1) / sizeof(value_type);
+ static constexpr size_t __min_cap = __fit_cap > 2 ? __fit_cap : 2;
+
+ // Now we know how many value_type will fit, calculate how much space they
+ // take up in total; add one byte for the final bitfield container; and round
+ // up to the nearest multiple of the alignment. That's the total size of the
+ // structure.
+ static constexpr size_t __short_packed_size = sizeof(value_type) * __min_cap + 1;
+ union __union_alignment_check { __long_min __long_; value_type v; };
+ static constexpr size_t __union_alignment = _LIBCPP_ALIGNOF(__union_alignment_check);
+ static constexpr size_t __full_size = (__short_packed_size + __union_alignment - 1) & -__union_alignment;
+
+ // Now define both structures for real, with padding to ensure they are both
+ // exactly the calculated size.
+
struct __long {
__long() = default;
@@ -829,19 +862,23 @@ private:
pointer __data_;
size_type __size_;
+ _LIBCPP_NO_UNIQUE_ADDRESS __padding<__full_size - sizeof(__long_min)> __padding_;
size_type __cap_ : sizeof(size_type) * CHAR_BIT - 1;
size_type __is_long_ : 1;
};
- enum { __min_cap = (sizeof(__long) - 1) / sizeof(value_type) > 2 ? (sizeof(__long) - 1) / sizeof(value_type) : 2 };
-
struct __short {
value_type __data_[__min_cap];
- _LIBCPP_NO_UNIQUE_ADDRESS __padding<sizeof(value_type) - 1> __padding_;
+ _LIBCPP_NO_UNIQUE_ADDRESS __padding<__full_size - __short_packed_size> __padding_;
unsigned char __size_ : 7;
unsigned char __is_long_ : 1;
};
+ // Finally, check that we got it all right, and that both structures have
+ // exactly the expected size.
+ static_assert(sizeof(__long) == __full_size, "Miscalculated size for __long");
+ static_assert(sizeof(__short) == __full_size, "Miscalculated size for __short");
+
// The __endian_factor is required because the field we use to store the size
// has one fewer bit than it would if it were not a bitfield.
//
@@ -887,6 +924,13 @@ private:
};
size_type __size_;
pointer __data_;
+
+ // No padding is needed in this version of the structure, but we add a
+ // zero-sized __padding_ member anyway to match the
+ // `_LIBCPP_ABI_ALTERNATE_STRING_LAYOUT` version, so that
+ // `__padding_.clear()` can be performed unconditionally in code common to
+ // both layouts.
+ _LIBCPP_NO_UNIQUE_ADDRESS __padding<0> __padding_;
};
enum { __min_cap = (sizeof(__long) - 1) / sizeof(value_type) > 2 ? (sizeof(__long) - 1) / sizeof(value_type) : 2 };
@@ -900,10 +944,10 @@ private:
value_type __data_[__min_cap];
};
-# endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
-
static_assert(sizeof(__short) == (sizeof(value_type) * (__min_cap + 1)), "__short has an unexpected size.");
+# endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+
union __rep {
__short __s;
__long __l;
@@ -2764,6 +2808,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait
// This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size
// at all.
__buffer.__size_ = -1;
+ __buffer.__padding_.clear();
__reset_internal_buffer(__buffer);
}
diff --git a/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp b/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
new file mode 100644
index 0000000000000..a4e258680cd2e
--- /dev/null
+++ b/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
@@ -0,0 +1,114 @@
+#include <stdint.h>
+#include <string>
+#include <algorithm>
+#include <cassert>
+
+#include "test_macros.h"
+
+template<typename Char>
+void test_string() {
+ // Make a test string.
+ std::basic_string<Char> s;
+ LIBCPP_ASSERT(s.size() == 0);
+
+ // Append enough chars to it that we must have switched over from a short
+ // string stored internally to a long one pointing to a dynamic buffer,
+ // causing a reallocation.
+ unsigned n = sizeof(s) / sizeof(Char) + 1;
+ for (unsigned i = 0; i < n; i++) {
+ s.push_back(Char::from_integer(i));
+ LIBCPP_ASSERT(s.size() == i + 1);
+ }
+
+ // Check that all the chars were correctly copied during the realloc.
+ for (unsigned i = 0; i < n; i++) {
+ LIBCPP_ASSERT(s[i] == Char::from_integer(i));
+ }
+}
+
+template<typename Integer, size_t N>
+struct TestChar {
+ Integer values[N];
+
+ static TestChar from_integer(unsigned index) {
+ TestChar ch;
+ for (size_t i = 0; i < N; i++)
+ ch.values[i] = index + i;
+ return ch;
+ }
+
+ bool operator==(const TestChar &other) const {
+ return 0 == memcmp(values, other.values, sizeof(values));
+ }
+ bool operator<(const TestChar &other) const {
+ return 0 < memcmp(values, other.values, sizeof(values));
+ }
+};
+
+template<typename Integer, size_t N>
+struct std::char_traits<TestChar<Integer, N>> {
+ using char_type = TestChar<Integer, N>;
+ using int_type = int;
+ using off_type = streamoff;
+ using pos_type = streampos;
+ using state_type = mbstate_t;
+
+ static TEST_CONSTEXPR_CXX20 void assign(char_type& c1, const char_type& c2) { c1 = c2; }
+ static bool eq(char_type c1, char_type c2);
+ static bool lt(char_type c1, char_type c2);
+
+ static int compare(const char_type* s1, const char_type* s2, std::size_t n);
+ static std::size_t length(const char_type* s);
+ static const char_type* find(const char_type* s, std::size_t n, const char_type& a);
+ static char_type* move(char_type* s1, const char_type* s2, std::size_t n);
+ static TEST_CONSTEXPR_CXX20 char_type* copy(char_type* s1, const char_type* s2, std::size_t n) {
+ std::copy_n(s2, n, s1);
+ return s1;
+ }
+ static TEST_CONSTEXPR_CXX20 char_type* assign(char_type* s, std::size_t n, char_type a) {
+ std::fill_n(s, n, a);
+ return s;
+ }
+
+ static int_type not_eof(int_type c);
+ static char_type to_char_type(int_type c);
+ static int_type to_int_type(char_type c);
+ static bool eq_int_type(int_type c1, int_type c2);
+ static int_type eof();
+};
+
+int main(int, char**) {
+ test_string<TestChar<uint8_t, 1>>();
+ test_string<TestChar<uint8_t, 2>>();
+ test_string<TestChar<uint8_t, 3>>();
+ test_string<TestChar<uint8_t, 4>>();
+ test_string<TestChar<uint8_t, 5>>();
+ test_string<TestChar<uint8_t, 6>>();
+ test_string<TestChar<uint8_t, 7>>();
+ test_string<TestChar<uint8_t, 8>>();
+ test_string<TestChar<uint8_t, 9>>();
+ test_string<TestChar<uint8_t, 10>>();
+ test_string<TestChar<uint8_t, 11>>();
+ test_string<TestChar<uint8_t, 12>>();
+ test_string<TestChar<uint8_t, 13>>();
+ test_string<TestChar<uint8_t, 14>>();
+ test_string<TestChar<uint8_t, 15>>();
+ test_string<TestChar<uint8_t, 16>>();
+
+ test_string<TestChar<uint16_t, 1>>();
+ test_string<TestChar<uint16_t, 2>>();
+ test_string<TestChar<uint16_t, 3>>();
+ test_string<TestChar<uint16_t, 4>>();
+ test_string<TestChar<uint16_t, 5>>();
+ test_string<TestChar<uint16_t, 6>>();
+ test_string<TestChar<uint16_t, 7>>();
+ test_string<TestChar<uint16_t, 8>>();
+
+ test_string<TestChar<uint32_t, 1>>();
+ test_string<TestChar<uint32_t, 2>>();
+ test_string<TestChar<uint32_t, 3>>();
+ test_string<TestChar<uint32_t, 4>>();
+
+ test_string<TestChar<uint64_t, 1>>();
+ test_string<TestChar<uint64_t, 2>>();
+}
diff --git a/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp b/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp
index 8c2324c9d1759..34669c28d624c 100644
--- a/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.modifiers/string_append/push_back.pass.cpp
@@ -84,9 +84,13 @@ TEST_CONSTEXPR_CXX20 bool test() {
// https://llvm.org/PR31454
std::basic_string<VeryLarge> s;
VeryLarge vl = {};
+ LIBCPP_ASSERT(s.size() == 0);
s.push_back(vl);
+ LIBCPP_ASSERT(s.size() == 1);
s.push_back(vl);
+ LIBCPP_ASSERT(s.size() == 2);
s.push_back(vl);
+ LIBCPP_ASSERT(s.size() == 3);
}
return true;
>From 5834b98e5ff2245d0cd7212c006800930e3f0820 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 20 Nov 2025 13:49:22 +0000
Subject: [PATCH 2/6] git-clang-format
---
.../basic.string/awkward-char-types.pass.cpp | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp b/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
index a4e258680cd2e..a649cd8f582f8 100644
--- a/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
@@ -5,7 +5,7 @@
#include "test_macros.h"
-template<typename Char>
+template <typename Char>
void test_string() {
// Make a test string.
std::basic_string<Char> s;
@@ -26,7 +26,7 @@ void test_string() {
}
}
-template<typename Integer, size_t N>
+template <typename Integer, size_t N>
struct TestChar {
Integer values[N];
@@ -37,15 +37,11 @@ struct TestChar {
return ch;
}
- bool operator==(const TestChar &other) const {
- return 0 == memcmp(values, other.values, sizeof(values));
- }
- bool operator<(const TestChar &other) const {
- return 0 < memcmp(values, other.values, sizeof(values));
- }
+ bool operator==(const TestChar& other) const { return 0 == memcmp(values, other.values, sizeof(values)); }
+ bool operator<(const TestChar& other) const { return 0 < memcmp(values, other.values, sizeof(values)); }
};
-template<typename Integer, size_t N>
+template <typename Integer, size_t N>
struct std::char_traits<TestChar<Integer, N>> {
using char_type = TestChar<Integer, N>;
using int_type = int;
>From ce04fcb17c361465e91317a94e79ce6432e1eaaa Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 20 Nov 2025 13:56:08 +0000
Subject: [PATCH 3/6] Use ordinary assert instead of LIBCPP_ASSERT
---
.../std/strings/basic.string/awkward-char-types.pass.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp b/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
index a649cd8f582f8..c94c2a2cb13ac 100644
--- a/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/awkward-char-types.pass.cpp
@@ -9,7 +9,7 @@ template <typename Char>
void test_string() {
// Make a test string.
std::basic_string<Char> s;
- LIBCPP_ASSERT(s.size() == 0);
+ assert(s.size() == 0);
// Append enough chars to it that we must have switched over from a short
// string stored internally to a long one pointing to a dynamic buffer,
@@ -17,12 +17,12 @@ void test_string() {
unsigned n = sizeof(s) / sizeof(Char) + 1;
for (unsigned i = 0; i < n; i++) {
s.push_back(Char::from_integer(i));
- LIBCPP_ASSERT(s.size() == i + 1);
+ assert(s.size() == i + 1);
}
// Check that all the chars were correctly copied during the realloc.
for (unsigned i = 0; i < n; i++) {
- LIBCPP_ASSERT(s[i] == Char::from_integer(i));
+ assert(s[i] == Char::from_integer(i));
}
}
>From 911d9ad88f8919d2150d9a49ba8b55d69bcf3744 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Mon, 24 Nov 2025 13:21:43 +0000
Subject: [PATCH 4/6] Replace __padding::clear() with __padding::empty()
One of the buildbot failures was objecting to a constexpr function
returning void, I guess because that's from a later C++ version.
---
libcxx/include/string | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/libcxx/include/string b/libcxx/include/string
index 8b2e599e17b2c..7814a6e15001e 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -719,15 +719,18 @@ struct __init_with_sentinel_tag {};
template <size_t _PaddingSize>
struct __padding {
char __padding_[_PaddingSize];
- constexpr void clear() {
+ static constexpr __padding empty() {
__padding __initialized = {0};
- *this = __initialized;
+ return __initialized;
}
};
template <>
struct __padding<0> {
- constexpr void clear() {}
+ static constexpr __padding empty() {
+ __padding __initialized;
+ return __initialized;
+ }
};
template <class _CharT, class _Traits, class _Allocator>
@@ -928,7 +931,7 @@ private:
// No padding is needed in this version of the structure, but we add a
// zero-sized __padding_ member anyway to match the
// `_LIBCPP_ABI_ALTERNATE_STRING_LAYOUT` version, so that
- // `__padding_.clear()` can be performed unconditionally in code common to
+ // `__padding::empty()` can be used unconditionally in code common to
// both layouts.
_LIBCPP_NO_UNIQUE_ADDRESS __padding<0> __padding_;
};
@@ -2817,7 +2820,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait
// This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size
// at all.
__buffer.__size_ = -1;
- __buffer.__padding_.clear();
+ __buffer.__padding_ = decltype(__buffer.__padding_)::empty();
__reset_internal_buffer(__buffer);
}
>From 5bcb59a6e03b022cf49ff070970689a911702502 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Mon, 24 Nov 2025 13:27:41 +0000
Subject: [PATCH 5/6] Avoid using constexpr where C++03 will hate it
Another CI failure occurred because of compiling in C++03 mode, where
'constexpr' isn't even a keyword. Replaced with
_LIBCPP_CONSTEXPR_SINCE_20 where feasible, and enum otherwise.
---
libcxx/include/string | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/libcxx/include/string b/libcxx/include/string
index 7814a6e15001e..3c413bc8d47b8 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -719,7 +719,7 @@ struct __init_with_sentinel_tag {};
template <size_t _PaddingSize>
struct __padding {
char __padding_[_PaddingSize];
- static constexpr __padding empty() {
+ static _LIBCPP_CONSTEXPR_SINCE_CXX20 __padding empty() {
__padding __initialized = {0};
return __initialized;
}
@@ -727,7 +727,7 @@ struct __padding {
template <>
struct __padding<0> {
- static constexpr __padding empty() {
+ static _LIBCPP_CONSTEXPR_SINCE_CXX20 __padding empty() {
__padding __initialized;
return __initialized;
}
@@ -840,17 +840,17 @@ private:
// The __short structure has a byte-sized bitfield container at the end, and
// the rest of it can be taken up by value_type. We want at least two
// value_type, or more if they will fit.
- static constexpr size_t __fit_cap = (sizeof(__long_min) - 1) / sizeof(value_type);
- static constexpr size_t __min_cap = __fit_cap > 2 ? __fit_cap : 2;
+ enum { __fit_cap = (sizeof(__long_min) - 1) / sizeof(value_type) };
+ enum { __min_cap = __fit_cap > 2 ? __fit_cap : 2 };
// Now we know how many value_type will fit, calculate how much space they
// take up in total; add one byte for the final bitfield container; and round
// up to the nearest multiple of the alignment. That's the total size of the
// structure.
- static constexpr size_t __short_packed_size = sizeof(value_type) * __min_cap + 1;
+ enum { __short_packed_size = sizeof(value_type) * __min_cap + 1 };
union __union_alignment_check { __long_min __long_; value_type v; };
- static constexpr size_t __union_alignment = _LIBCPP_ALIGNOF(__union_alignment_check);
- static constexpr size_t __full_size = (__short_packed_size + __union_alignment - 1) & -__union_alignment;
+ enum { __union_alignment = _LIBCPP_ALIGNOF(__union_alignment_check) };
+ enum { __full_size = (__short_packed_size + __union_alignment - 1) & -__union_alignment };
// Now define both structures for real, with padding to ensure they are both
// exactly the calculated size.
>From fbc90a649679cceac6b61b14aada26ec33e0c855 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Mon, 24 Nov 2025 13:35:42 +0000
Subject: [PATCH 6/6] clang-format *again*
This one is particularly annoying because my local run of
git-clang-format didn't pick it up and I had to round-trip through
official CI to even find out there was a problem!
---
libcxx/include/string | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/libcxx/include/string b/libcxx/include/string
index 3c413bc8d47b8..8ac66636c6007 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -848,7 +848,10 @@ private:
// up to the nearest multiple of the alignment. That's the total size of the
// structure.
enum { __short_packed_size = sizeof(value_type) * __min_cap + 1 };
- union __union_alignment_check { __long_min __long_; value_type v; };
+ union __union_alignment_check {
+ __long_min __long_;
+ value_type v;
+ };
enum { __union_alignment = _LIBCPP_ALIGNOF(__union_alignment_check) };
enum { __full_size = (__short_packed_size + __union_alignment - 1) & -__union_alignment };
More information about the libcxx-commits
mailing list