[libcxx-commits] [libcxx] 29c8c07 - [libc++] Use bit field for checking if string is in long or short mode

Nikolas Klauser via libcxx-commits libcxx-commits at lists.llvm.org
Thu Apr 21 05:20:27 PDT 2022


Author: Nikolas Klauser
Date: 2022-04-21T14:20:21+02:00
New Revision: 29c8c070a1770fc510ccad3be753f6f50336f8cc

URL: https://github.com/llvm/llvm-project/commit/29c8c070a1770fc510ccad3be753f6f50336f8cc
DIFF: https://github.com/llvm/llvm-project/commit/29c8c070a1770fc510ccad3be753f6f50336f8cc.diff

LOG: [libc++] Use bit field for checking if string is in long or short mode

This makes the code a bit simpler and (I think) removes the undefined behaviour from the normal string layout.

Reviewed By: ldionne, Mordante, #libc

Spies: labath, dblaikie, JDevlieghere, krytarowski, jgorbe, jingham, saugustine, arichardson, libcxx-commits

Differential Revision: https://reviews.llvm.org/D123580

Added: 
    libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp

Modified: 
    libcxx/include/string
    libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp
    libcxx/utils/gdb/libcxx/printers.py

Removed: 
    


################################################################################
diff  --git a/libcxx/include/string b/libcxx/include/string
index 9fd19d2ad9570..bd1c4303e8a40 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -532,6 +532,8 @@ basic_string<char32_t> operator "" s( const char32_t *str, size_t len ); // C++1
 #include <__utility/auto_cast.h>
 #include <__utility/move.h>
 #include <__utility/swap.h>
+#include <__utility/unreachable.h>
+#include <climits>
 #include <compare>
 #include <cstdio>  // EOF
 #include <cstdlib>
@@ -539,6 +541,7 @@ basic_string<char32_t> operator "" s( const char32_t *str, size_t len ); // C++1
 #include <initializer_list>
 #include <iosfwd>
 #include <iterator>
+#include <limits>
 #include <memory>
 #include <stdexcept>
 #include <string_view>
@@ -674,17 +677,10 @@ private:
     {
         pointer   __data_;
         size_type __size_;
-        size_type __cap_;
+        size_type __cap_ : sizeof(size_type) * CHAR_BIT - 1;
+        size_type __is_long_ : 1;
     };
 
-#ifdef _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x01;
-    static const size_type __long_mask  = 0x1ul;
-#else  // _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x80;
-    static const size_type __long_mask  = ~(size_type(~0) >> 1);
-#endif // _LIBCPP_BIG_ENDIAN
-
     enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
                       (sizeof(__long) - 1)/sizeof(value_type) : 2};
 
@@ -692,26 +688,46 @@ private:
     {
         value_type __data_[__min_cap];
         unsigned char __padding[sizeof(value_type) - 1];
-        unsigned char __size_;
+        unsigned char __size_ : 7;
+        unsigned char __is_long_ : 1;
     };
 
+// The __endian_factor is required because the field we use to store the size
+// (either size_type or unsigned char depending on long/short) has one fewer
+// bit than it would if it were not a bitfield.
+//
+// If the LSB is used to store the short-flag in the short string representation,
+// we have to multiply the size by two when it is stored and divide it by two when
+// it is loaded to make sure that we always store an even number. In the long string
+// representation, we can ignore this because we can assume that we always allocate
+// an even amount of value_types.
+//
+// If the MSB is used for the short-flag, the max_size() is numeric_limits<size_type>::max() / 2.
+// This does not impact the short string representation, since we never need the MSB
+// for representing the size of a short string anyway.
+
+#ifdef _LIBCPP_BIG_ENDIAN
+    static const size_type __endian_factor = 2;
 #else
+    static const size_type __endian_factor = 1;
+#endif
+
+#else // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+
+#ifdef _LIBCPP_BIG_ENDIAN
+    static const size_type __endian_factor = 1;
+#else
+    static const size_type __endian_factor = 2;
+#endif
 
     struct __long
     {
-        size_type __cap_;
+        size_type __is_long_ : 1;
+        size_type __cap_ : sizeof(size_type) * CHAR_BIT - 1;
         size_type __size_;
         pointer   __data_;
     };
 
-#ifdef _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x80;
-    static const size_type __long_mask  = ~(size_type(~0) >> 1);
-#else  // _LIBCPP_BIG_ENDIAN
-    static const size_type __short_mask = 0x01;
-    static const size_type __long_mask  = 0x1ul;
-#endif // _LIBCPP_BIG_ENDIAN
-
     enum {__min_cap = (sizeof(__long) - 1)/sizeof(value_type) > 2 ?
                       (sizeof(__long) - 1)/sizeof(value_type) : 2};
 
@@ -719,7 +735,10 @@ private:
     {
         union
         {
-            unsigned char __size_;
+            struct {
+                unsigned char __is_long_ : 1;
+                unsigned char __size_ : 7;
+            };
             value_type __lx;
         };
         value_type __data_[__min_cap];
@@ -1426,8 +1445,9 @@ public:
     _LIBCPP_INLINE_VISIBILITY void __shrink_or_extend(size_type __target_capacity);
 
     _LIBCPP_INLINE_VISIBILITY
-    bool __is_long() const _NOEXCEPT
-        {return bool(__r_.first().__s.__size_ & __short_mask);}
+    bool __is_long() const _NOEXCEPT {
+        return __r_.first().__s.__is_long_;
+    }
 
 #if _LIBCPP_DEBUG_LEVEL == 2
 
@@ -1474,43 +1494,18 @@ private:
     _LIBCPP_HIDE_FROM_ABI allocator_type& __alloc() _NOEXCEPT { return __r_.second(); }
     _LIBCPP_HIDE_FROM_ABI const allocator_type& __alloc() const _NOEXCEPT { return __r_.second(); }
 
-#ifdef _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
-
-    _LIBCPP_INLINE_VISIBILITY
-    void __set_short_size(size_type __s) _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {__r_.first().__s.__size_ = (unsigned char)(__s << 1);}
-#   else
-        {__r_.first().__s.__size_ = (unsigned char)(__s);}
-#   endif
-
-    _LIBCPP_INLINE_VISIBILITY
-    size_type __get_short_size() const _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {return __r_.first().__s.__size_ >> 1;}
-#   else
-        {return __r_.first().__s.__size_;}
-#   endif
-
-#else  // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
-
     _LIBCPP_INLINE_VISIBILITY
-    void __set_short_size(size_type __s) _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {__r_.first().__s.__size_ = (unsigned char)(__s);}
-#   else
-        {__r_.first().__s.__size_ = (unsigned char)(__s << 1);}
-#   endif
+    void __set_short_size(size_type __s) _NOEXCEPT {
+        _LIBCPP_ASSERT(__s < __min_cap, "__s should never be greater than or equal to the short string capacity");
+        __r_.first().__s.__size_ = __s;
+        __r_.first().__s.__is_long_ = false;
+    }
 
     _LIBCPP_INLINE_VISIBILITY
-    size_type __get_short_size() const _NOEXCEPT
-#   ifdef _LIBCPP_BIG_ENDIAN
-        {return __r_.first().__s.__size_;}
-#   else
-        {return __r_.first().__s.__size_ >> 1;}
-#   endif
-
-#endif // _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT
+    size_type __get_short_size() const _NOEXCEPT {
+        _LIBCPP_ASSERT(!__r_.first().__s.__is_long_, "String has to be short when trying to get the short size");
+        return __r_.first().__s.__size_;
+    }
 
     _LIBCPP_INLINE_VISIBILITY
     void __set_long_size(size_type __s) _NOEXCEPT
@@ -1523,11 +1518,15 @@ private:
         {if (__is_long()) __set_long_size(__s); else __set_short_size(__s);}
 
     _LIBCPP_INLINE_VISIBILITY
-    void __set_long_cap(size_type __s) _NOEXCEPT
-        {__r_.first().__l.__cap_  = __long_mask | __s;}
+    void __set_long_cap(size_type __s) _NOEXCEPT {
+        __r_.first().__l.__cap_ = __s / __endian_factor;
+        __r_.first().__l.__is_long_ = true;
+    }
+
     _LIBCPP_INLINE_VISIBILITY
-    size_type __get_long_cap() const _NOEXCEPT
-        {return __r_.first().__l.__cap_ & size_type(~__long_mask);}
+    size_type __get_long_cap() const _NOEXCEPT {
+        return __r_.first().__l.__cap_ * __endian_factor;
+    }
 
     _LIBCPP_INLINE_VISIBILITY
     void __set_long_pointer(pointer __p) _NOEXCEPT
@@ -3225,11 +3224,12 @@ typename basic_string<_CharT, _Traits, _Allocator>::size_type
 basic_string<_CharT, _Traits, _Allocator>::max_size() const _NOEXCEPT
 {
     size_type __m = __alloc_traits::max_size(__alloc());
-#ifdef _LIBCPP_BIG_ENDIAN
-    return (__m <= ~__long_mask ? __m : __m/2) - __alignment;
-#else
-    return __m - __alignment;
-#endif
+    if (__m <= std::numeric_limits<size_type>::max() / 2) {
+        return __m - __alignment;
+    } else {
+        bool __uses_lsb = __endian_factor == 2;
+        return __uses_lsb ? __m - __alignment : (__m / 2) - __alignment;
+    }
 }
 
 template <class _CharT, class _Traits, class _Allocator>

diff  --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp
new file mode 100644
index 0000000000000..65dfead25cdf0
--- /dev/null
+++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp
@@ -0,0 +1,125 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED:
+
+// <string>
+
+// This test ensures that the correct max_size() is returned depending on the platform.
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <string>
+
+#include "test_macros.h"
+
+// alignment of the string heap buffer is hardcoded to 16
+static const size_t alignment = 16;
+
+void full_size() {
+  std::string str;
+  assert(str.max_size() == std::numeric_limits<size_t>::max() - alignment);
+
+#ifndef TEST_HAS_NO_CHAR8_T
+  std::u8string u8str;
+  assert(u8str.max_size() == std::numeric_limits<size_t>::max() - alignment);
+#endif
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  std::wstring wstr;
+  assert(wstr.max_size() == std::numeric_limits<size_t>::max() / sizeof(wchar_t) - alignment);
+#endif
+
+#ifndef TEST_HAS_NO_UNICODE_CHARS
+  std::u16string u16str;
+  std::u32string u32str;
+  assert(u16str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
+  assert(u32str.max_size() == std::numeric_limits<size_t>::max() / 4 - alignment);
+#endif
+}
+
+void half_size() {
+  std::string str;
+  assert(str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
+
+#ifndef TEST_HAS_NO_CHAR8_T
+  std::u8string u8str;
+  assert(u8str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
+#endif
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  std::wstring wstr;
+  assert(wstr.max_size() == std::numeric_limits<size_t>::max() / std::max<size_t>(2ul, sizeof(wchar_t)) - alignment);
+#endif
+
+#ifndef TEST_HAS_NO_UNICODE_CHARS
+  std::u16string u16str;
+  std::u32string u32str;
+  assert(u16str.max_size() == std::numeric_limits<size_t>::max() / 2 - alignment);
+  assert(u32str.max_size() == std::numeric_limits<size_t>::max() / 4 - alignment);
+#endif
+}
+
+bool test() {
+#if _LIBCPP_ABI_VERSION == 1
+#ifdef __APPLE__
+
+#  ifdef __aarch64__
+  half_size();
+#  elif defined(__x86_64__)
+  full_size();
+#  else
+#    error "Your target system seems to be unsupported."
+#  endif
+
+#elif defined(__linux__)
+
+#  ifdef  __x86_64__
+  full_size();
+#  elif defined(__arm__) || defined(__aarch64__)
+#    ifdef __BIG_ENDIAN__
+  half_size();
+#    else
+  full_size();
+#    endif
+
+#  else
+#    error "Your target system seems to be unsupported."
+#  endif
+
+#elif defined(__powerpc__)
+  half_size();
+#elif defined(__powepc64__)
+  half_size();
+
+#elif defined(_WIN64)
+
+#  ifdef __x86_64__
+  full_size();
+#  else
+#    error "Your target system seems to be unsupported."
+#  endif
+
+#else
+#  error "Your target system seems to be unsupported."
+#endif
+
+#endif
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER > 17
+  // static_assert(test());
+#endif
+
+  return 0;
+}

diff  --git a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp
index 2b6b0e2ae66ea..a2c699456d858 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp
@@ -9,6 +9,11 @@
 // UNSUPPORTED: no-exceptions
 // XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11}}
 
+// Prior to http://llvm.org/D123580, there was a bug with how the max_size()
+// was calculated. That was inlined into some functions in the dylib, which leads
+// to failures when running this test against an older system dylib.
+// XFAIL: use_system_cxx_lib && target=arm64-apple-macosx{{11.0|12.0}}
+
 // <string>
 
 // size_type max_size() const;

diff  --git a/libcxx/utils/gdb/libcxx/printers.py b/libcxx/utils/gdb/libcxx/printers.py
index acf77b648cb26..d98b6269f1f9d 100644
--- a/libcxx/utils/gdb/libcxx/printers.py
+++ b/libcxx/utils/gdb/libcxx/printers.py
@@ -192,26 +192,6 @@ def _value_of_pair_first(value):
 class StdStringPrinter(object):
     """Print a std::string."""
 
-    def _get_short_size(self, short_field, short_size):
-        """Short size depends on both endianness and a compile-time define."""
-
-        # If the padding field is present after all this indirection, then string
-        # was compiled with _LIBCPP_ABI_ALTERNATE_STRING_LAYOUT defined.
-        field = short_field.type.fields()[1].type.fields()[0]
-        libcpp_abi_alternate_string_layout = field.name and "__padding" in field.name
-
-        # This logical structure closely follows the original code (which is clearer
-        # in C++).  Keep them parallel to make them easier to compare.
-        if libcpp_abi_alternate_string_layout:
-            if _libcpp_big_endian:
-                return short_size >> 1
-            else:
-                return short_size
-        elif _libcpp_big_endian:
-            return short_size
-        else:
-            return short_size >> 1
-
     def __init__(self, val):
         self.val = val
 
@@ -223,18 +203,13 @@ def to_string(self):
         short_size = short_field["__size_"]
         if short_size == 0:
             return ""
-        short_mask = self.val["__short_mask"]
-        # Counter intuitive to compare the size and short_mask to see if the string
-        # is long, but that's the way the implementation does it. Note that
-        # __is_long() doesn't use get_short_size in C++.
-        is_long = short_size & short_mask
-        if is_long:
+        if short_field["__is_long_"]:
             long_field = value_field["__l"]
             data = long_field["__data_"]
             size = long_field["__size_"]
         else:
             data = short_field["__data_"]
-            size = self._get_short_size(short_field, short_size)
+            size = short_field["__size_"]
         return data.lazy_string(length=size)
 
     def display_hint(self):


        


More information about the libcxx-commits mailing list