[libcxx-commits] [libcxx] c866855 - [libc++][format] Improves Unicode decoders.

Mark de Wever via libcxx-commits libcxx-commits at lists.llvm.org
Wed Mar 8 13:01:54 PST 2023


Author: Mark de Wever
Date: 2023-03-08T22:01:49+01:00
New Revision: c866855b42eb3e8aa7578aadb26e4431d1d71efd

URL: https://github.com/llvm/llvm-project/commit/c866855b42eb3e8aa7578aadb26e4431d1d71efd
DIFF: https://github.com/llvm/llvm-project/commit/c866855b42eb3e8aa7578aadb26e4431d1d71efd.diff

LOG: [libc++][format] Improves Unicode decoders.

During the implementation of P2286 a second Unicode decoder was added.
The original decoder was only used for the width estimation. Changing
an ill-formed Unicode sequence to the replacement character, works
properly for this use case. For P2286 an ill-formed Unicode sequence
needs to be formatted as a sequence of code units. The exact wording in
the Standard was a bit unclear and there was an odd example in the WP. This
made it hard to use the same decoder. SG16 determined the odd example in
the WP was a bug and this has been fixed in the WP.

This made it possible to combine the two decoders. The P2286 decoder
kept track of the size of the ill-formed sequence. However this was not
needed since the output algorithm needs to keep track of the size of a
well-formed and an ill-formed sequence. So this feature has been
removed.

The error status remains since it's needed for P2286, the grapheme
clustering can ignore this unneeded value. (In general, grapheme
clustering only has specified behaviour for Unicode. When the string
is in a non-Unicode encoding there are no requirements. Ill-formed
Unicode is a non-Unicode encoding. Still libc++ does a best effort
estimation.)

The UTF-8 decoder accepted several ill-formed sequences:
- Values in the surrogate range U+D800..U+DFFF.
- Values encoded in more code units than required were accepted. For
  example, U+0020 can in theory be encoded using 1, 2, 3, or 4 code
  units. This is not allowed by the Unicode Standard.
- Values larger than U+10FFFF were not always rejected.

Reviewed By: #libc, ldionne, tahonermann, Mordante

Differential Revision: https://reviews.llvm.org/D144346

Added: 
    

Modified: 
    libcxx/include/__format/formatter_output.h
    libcxx/include/__format/unicode.h
    libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp

Removed: 
    


################################################################################
diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h
index 6c8cadb76eec5..2b3c5ae97d71d 100644
--- a/libcxx/include/__format/formatter_output.h
+++ b/libcxx/include/__format/formatter_output.h
@@ -12,6 +12,7 @@
 
 #include <__algorithm/ranges_copy.h>
 #include <__algorithm/ranges_fill_n.h>
+#include <__algorithm/ranges_for_each.h>
 #include <__algorithm/ranges_transform.h>
 #include <__chrono/statically_widen.h>
 #include <__concepts/same_as.h>
@@ -503,36 +504,17 @@ __escape(basic_string<_CharT>& __str, basic_string_view<_CharT> __values, __esca
   __unicode::__code_point_view<_CharT> __view{__values.begin(), __values.end()};
 
   while (!__view.__at_end()) {
-    auto __first                                        = __view.__position();
-    typename __unicode::__consume_p2286_result __result = __view.__consume_p2286();
-    if (__result.__ill_formed_size == 0) {
-      if (!__formatter::__is_escaped_sequence_written(__str, __result.__value, __mark))
+    auto __first                                  = __view.__position();
+    typename __unicode::__consume_result __result = __view.__consume();
+    if (__result.__status == __unicode::__consume_result::__ok) {
+      if (!__formatter::__is_escaped_sequence_written(__str, __result.__code_point, __mark))
         // 2.2.1.3 - Add the character
         ranges::copy(__first, __view.__position(), std::back_insert_iterator(__str));
-
     } else {
       // 2.2.3 sequence of ill-formed code units
-      // The number of code-units in __result.__value depends on the character type being used.
-      if constexpr (sizeof(_CharT) == 1) {
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1 || __result.__ill_formed_size == 4,
-                       "illegal number of invalid code units.");
-        if (__result.__ill_formed_size == 1) // ill-formed, one code unit
-          __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xff);
-        else { // out of valid range, four code units
-               // The code point was properly encoded, decode the value.
-          __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value >> 18 | 0xf0);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 12 & 0x3f) | 0x80);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value >> 6 & 0x3f) | 0x80);
-          __formatter::__write_escape_ill_formed_code_unit(__str, (__result.__value & 0x3f) | 0x80);
-        }
-      } else if constexpr (sizeof(_CharT) == 2) {
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-16 at most one invalid code unit");
-        __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value & 0xffff);
-      } else {
-        static_assert(sizeof(_CharT) == 4, "unsupported character width");
-        _LIBCPP_ASSERT(__result.__ill_formed_size == 1, "for UTF-32 one code unit is one code point");
-        __formatter::__write_escape_ill_formed_code_unit(__str, __result.__value);
-      }
+      ranges::for_each(__first, __view.__position(), [&](_CharT __value) {
+        __formatter::__write_escape_ill_formed_code_unit(__str, __formatter::__to_char32(__value));
+      });
     }
   }
 }

diff --git a/libcxx/include/__format/unicode.h b/libcxx/include/__format/unicode.h
index 53b5320600dc1..12aed507990e8 100644
--- a/libcxx/include/__format/unicode.h
+++ b/libcxx/include/__format/unicode.h
@@ -31,23 +31,28 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 namespace __unicode {
 
-#  if _LIBCPP_STD_VER >= 23
-
-/// The result of consuming a code point using P2286' semantics
-///
-/// TODO FMT Combine __consume and  __consume_p2286 in one function.
-struct __consume_p2286_result {
-  // A size of 0 means well formed. This to differenciate between
-  // a valid code point and a code unit that's invalid like 0b11111xxx.
-  int __ill_formed_size;
-
-  // If well formed the consumed code point.
-  // Otherwise the ill-formed code units as unsigned 8-bit values. They are
-  // stored in reverse order, to make it easier to extract the values.
-  char32_t __value;
+// Helper struct for the result of a consume operation.
+//
+// The status value for a correct code point is 0. This allows a valid value to
+// be used without masking.
+// When the decoding fails it know the number of code units affected. For the
+// current use-cases that value is not needed, therefore it is not stored.
+// The escape routine needs the number of code units for both a valid and
+// invalid character and keeps track of it itself. Doing it in this result
+// unconditionally would give some overhead when the value is unneeded.
+struct __consume_result {
+  // When __status == __ok it contains the decoded code point.
+  // Else it contains the replacement character U+FFFD
+  char32_t __code_point : 31;
+
+  enum : char32_t {
+    // Consumed a well-formed code point.
+    __ok = 0,
+    // Encountered invalid UTF-8
+    __error = 1
+  } __status : 1 {__ok};
 };
-
-#  endif // _LIBCPP_STD_VER >= 23
+static_assert(sizeof(__consume_result) == sizeof(char32_t));
 
 #  ifndef _LIBCPP_HAS_NO_UNICODE
 
@@ -66,6 +71,36 @@ struct __consume_p2286_result {
 
 inline constexpr char32_t __replacement_character = U'\ufffd';
 
+// The error of a consume operation.
+//
+// This sets the code point to the replacement character. This code point does
+// not participate in the grapheme clustering, so grapheme clustering code can
+// ignore the error status and always use the code point.
+inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
+  return __value >= 0xd800 && __value <= 0xdbff;
+}
+
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
+  return __value >= 0xdc00 && __value <= 0xdfff;
+}
+
+// https://www.unicode.org/glossary/#surrogate_code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
+  return __value >= 0xd800 && __value <= 0xdfff;
+}
+
+// https://www.unicode.org/glossary/#code_point
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
+  return __value <= 0x10ffff;
+}
+
+// https://www.unicode.org/glossary/#unicode_scalar_value
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
+  return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value);
+}
+
 template <contiguous_iterator _Iterator>
   requires same_as<iter_value_t<_Iterator>, char>
 _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
@@ -97,122 +132,103 @@ class __code_point_view<char> {
   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
   _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  // https://www.unicode.org/versions/latest/ch03.pdf#G7404
+  // Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
+  //
+  // Code Points        First Byte Second Byte Third Byte Fourth Byte  Remarks
+  // U+0000..U+007F     00..7F                                         U+0000..U+007F 1 code unit range
+  //                    C0..C1     80..BF                              invalid overlong encoding
+  // U+0080..U+07FF     C2..DF     80..BF                              U+0080..U+07FF 2 code unit range
+  //                    E0         80..9F      80..BF                  invalid overlong encoding
+  // U+0800..U+0FFF     E0         A0..BF      80..BF                  U+0800..U+FFFF 3 code unit range
+  // U+1000..U+CFFF     E1..EC     80..BF      80..BF
+  // U+D000..U+D7FF     ED         80..9F      80..BF
+  // U+D800..U+DFFF     ED         A0..BF      80..BF                  invalid encoding of surrogate code point
+  // U+E000..U+FFFF     EE..EF     80..BF      80..BF
+  //                    F0         80..8F      80..BF     80..BF       invalid overlong encoding
+  // U+10000..U+3FFFF   F0         90..BF      80..BF     80..BF       U+10000..U+10FFFF 4 code unit range
+  // U+40000..U+FFFFF   F1..F3     80..BF      80..BF     80..BF
+  // U+100000..U+10FFFF F4         80..8F      80..BF     80..BF
+  //                    F4         90..BF      80..BF     80..BF       U+110000.. invalid code point range
+  //
+  // Unlike other parsers, these invalid entries are tested after decoding.
+  // - The parser always needs to consume these code units
+  // - The code is optimized for well-formed UTF-8
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
     _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
 
     // Based on the number of leading 1 bits the number of code units in the
     // code point can be determined. See
     // https://en.wikipedia.org/wiki/UTF-8#Encoding
-    switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
+    switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
     case 0:
-      return *__first_++;
+      return {static_cast<unsigned char>(*__first_++)};
 
-    case 2:
+    case 2: {
       if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
         break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
 
-    case 3:
-      if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
-        break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
 
-    case 4:
-      if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
-        break;
-      else {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return __value;
-      }
-    }
-    // An invalid number of leading ones can be garbage or a code unit in the
-    // middle of a code point. By consuming one code unit the parser may get
-    // "in sync" after a few code units.
-    ++__first_;
-    return __replacement_character;
-  }
-
-#    if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
+      // These values should be encoded in 1 UTF-8 code unit.
+      if (__value < 0x0080) [[unlikely]]
+        return __consume_result_error;
 
-    // Based on the number of leading 1 bits the number of code units in the
-    // code point can be determined. See
-    // https://en.wikipedia.org/wiki/UTF-8#Encoding
-    switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
-    case 0:
-      return {0, static_cast<unsigned char>(*__first_++)};
+      return {__value};
+    }
 
-    case 2:
-      if (__last_ - __first_ < 2) [[unlikely]]
+    case 3: {
+      if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
         break;
 
-      if (__unicode::__is_continuation(__first_ + 1, 1)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return {0, __value};
-      }
-      break;
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
 
-    case 3:
-      if (__last_ - __first_ < 3) [[unlikely]]
-        break;
+      // These values should be encoded in 1 or 2 UTF-8 code units.
+      if (__value < 0x0800) [[unlikely]]
+        return __consume_result_error;
 
-      if (__unicode::__is_continuation(__first_ + 1, 2)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        return {0, __value};
-      }
-      break;
+      // A surrogate value is always encoded in 3 UTF-8 code units.
+      if (__unicode::__is_surrogate(__value)) [[unlikely]]
+        return __consume_result_error;
+
+      return {__value};
+    }
 
-    case 4:
-      if (__last_ - __first_ < 4) [[unlikely]]
+    case 4: {
+      if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
         break;
 
-      if (__unicode::__is_continuation(__first_ + 1, 3)) {
-        char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
-        __value <<= 6;
-        __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
+      __value <<= 6;
+      __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
 
-        if (__value > 0x10FFFF) // Outside the valid Unicode range?
-          return {4, __value};
+      // These values should be encoded in 1, 2, or 3 UTF-8 code units.
+      if (__value < 0x10000) [[unlikely]]
+        return __consume_result_error;
 
-        return {0, __value};
-      }
-      break;
+      // A value too large is always encoded in 4 UTF-8 code units.
+      if (!__unicode::__is_code_point(__value)) [[unlikely]]
+        return __consume_result_error;
+
+      return {__value};
+    }
     }
     // An invalid number of leading ones can be garbage or a code unit in the
     // middle of a code point. By consuming one code unit the parser may get
     // "in sync" after a few code units.
-    return {1, static_cast<unsigned char>(*__first_++)};
+    ++__first_;
+    return __consume_result_error;
   }
-#    endif // _LIBCPP_STD_VER >= 23
 
 private:
   _Iterator __first_;
@@ -244,62 +260,33 @@ class __code_point_view<wchar_t> {
   _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
     _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
 
+    char32_t __value = static_cast<char32_t>(*__first_++);
     if constexpr (sizeof(wchar_t) == 2) {
-      char32_t __result = *__first_++;
-      // Is the code unit part of a surrogate pair? See
-      // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
-      if (__result >= 0xd800 && __result <= 0xDfff) {
-        // Malformed Unicode.
-        if (__first_ == __last_) [[unlikely]]
-          return __replacement_character;
-
-        __result -= 0xd800;
-        __result <<= 10;
-        __result += *__first_++ - 0xdc00;
-        __result += 0x10000;
-      }
-      return __result;
+      if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
+        return __consume_result_error;
 
-    } else if constexpr (sizeof(wchar_t) == 4) {
-      char32_t __result = *__first_++;
-      if (__result > 0x10FFFF) [[unlikely]]
-        return __replacement_character;
-      return __result;
-    } else {
-      __libcpp_unreachable();
-    }
-  }
+      if (__unicode::__is_high_surrogate(__value)) {
+        if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
+          return __consume_result_error;
 
-#      if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
+        __value -= 0xd800;
+        __value <<= 10;
+        __value += static_cast<char32_t>(*__first_++) - 0xdc00;
+        __value += 0x10000;
 
-    char32_t __result = *__first_++;
-    if constexpr (sizeof(wchar_t) == 2) {
-      // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
-      if (__is_surrogate_pair_high(__result)) {
-        // Malformed Unicode.
-        if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
-          return {1, __result};
-
-        __result -= 0xd800;
-        __result <<= 10;
-        __result += *__first_++ - 0xdc00;
-        __result += 0x10000;
-      } else if (__is_surrogate_pair_low(__result))
-        // A code point shouldn't start with the low surrogate pair
-        return {1, __result};
+        if (!__unicode::__is_code_point(__value)) [[unlikely]]
+          return __consume_result_error;
+      }
     } else {
-      if (__result > 0x10FFFF) [[unlikely]]
-        return {1, __result};
+      if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
+        return __consume_result_error;
     }
 
-    return {0, __result};
+    return {__value};
   }
-#      endif // _LIBCPP_STD_VER >= 23
 
 private:
   _Iterator __first_;
@@ -399,7 +386,7 @@ class __extended_grapheme_cluster_view {
 public:
   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
       : __code_point_view_(__first, __last),
-        __next_code_point_(__code_point_view_.__consume()),
+        __next_code_point_(__code_point_view_.__consume().__code_point),
         __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
 
   struct __cluster {
@@ -420,6 +407,7 @@ class __extended_grapheme_cluster_view {
     _LIBCPP_ASSERT(
         __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
         "can't move beyond the end of input");
+
     char32_t __code_point = __next_code_point_;
     if (!__code_point_view_.__at_end())
       return {__code_point, __get_break()};
@@ -444,7 +432,7 @@ class __extended_grapheme_cluster_view {
         __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
         return __result;
       }
-      __next_code_point_ = __code_point_view_.__consume();
+      __next_code_point_ = __code_point_view_.__consume().__code_point;
       __next_prop_       = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
 
       __has_extened_pictographic |=
@@ -474,18 +462,10 @@ class __code_point_view {
   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
   _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
 
-  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
     _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-    return *__first_++;
-  }
-
-#    if _LIBCPP_STD_VER >= 23
-  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
-    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
-
-    return {0, std::make_unsigned_t<_CharT>(*__first_++)};
+    return {static_cast<char32_t>(*__first_++)};
   }
-#    endif // _LIBCPP_STD_VER >= 23
 
 private:
   _Iterator __first_;

diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
index 90bd305b80fd4..9caa3a22d7503 100644
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@@ -202,14 +202,9 @@ void test_char() {
     test_format(V{L"'\\u{600}'"}, L"{:?}", L'\x600');   // ARABIC NUMBER SIGN
     test_format(V{L"'\\u{feff}'"}, L"{:?}", L'\xfeff'); // ZERO WIDTH NO-BREAK SPACE
 
-    if constexpr (sizeof(CharT) == 2) {
-      // Incomplete surrogate pair in UTF-16
-      test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
-      test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
-    } else {
-      test_format(V{L"'\\u{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
-      test_format(V{L"'\\u{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
-    }
+    // Incomplete surrogate pair in UTF-16
+    test_format(V{L"'\\x{d800}'"}, L"{:?}", L'\xd800'); // <surrogate-D800>
+    test_format(V{L"'\\x{dfff}'"}, L"{:?}", L'\xdfff'); // <surrogate-DFFF>
 
     // Private_Use
     test_format(V{L"'\\u{e000}'"}, L"{:?}", L'\xe000'); // <private-use-E000>
@@ -277,6 +272,48 @@ void test_string() {
     // Ill-formend UTF-8
     test_format(SV(R"(["\x{c3}"])"), SV("[{:?}]"), "\xc3");
     test_format(SV(R"(["\x{c3}("])"), SV("[{:?}]"), "\xc3\x28");
+
+    /* U+0000..U+0007F 1 code unit range, encoded in 2 code units. */
+    test_format(SV(R"(["\x{c0}\x{80}"])"), SV("[{:?}]"), "\xc0\x80"); // U+0000
+    test_format(SV(R"(["\x{c1}\x{bf}"])"), SV("[{:?}]"), "\xc1\xbf"); // U+007F
+    test_format(SV(R"(["\u{80}"])"), SV("[{:?}]"), "\xc2\x80");       // U+0080 first valid (General_Category=Control)
+
+    /* U+0000..U+07FFF 1 and 2 code unit range, encoded in 3 code units. */
+    test_format(SV(R"(["\x{e0}\x{80}\x{80}"])"), SV("[{:?}]"), "\xe0\x80\x80"); // U+0000
+    test_format(SV(R"(["\x{e0}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xe0\x81\xbf"); // U+007F
+    test_format(SV(R"(["\x{e0}\x{82}\x{80}"])"), SV("[{:?}]"), "\xe0\x82\x80"); // U+0080
+    test_format(SV(R"(["\x{e0}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xe0\x9f\xbf"); // U+07FF
+    test_format(SV("[\"\u0800\"]"), SV("[{:?}]"), "\xe0\xa0\x80");              // U+0800 first valid
+
+#if 0
+	// This code point is in the Hangul Jamo Extended-B block and at the time of writing
+	// it's unassigned. When it comes defined, this branch might become true.
+    test_format(SV("[\"\ud7ff\"]"), SV("[{:?}]"), "\xed\x9f\xbf");              // U+D7FF last valid
+#else
+    /* U+D800..D+DFFFF surrogate range */
+    test_format(SV(R"(["\u{d7ff}"])"), SV("[{:?}]"), "\xed\x9f\xbf");           // U+D7FF last valid
+#endif
+    test_format(SV(R"(["\x{ed}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xed\xa0\x80"); // U+D800
+    test_format(SV(R"(["\x{ed}\x{af}\x{bf}"])"), SV("[{:?}]"), "\xed\xaf\xbf"); // U+DBFF
+    test_format(SV(R"(["\x{ed}\x{bf}\x{80}"])"), SV("[{:?}]"), "\xed\xbf\x80"); // U+DC00
+    test_format(SV(R"(["\x{ed}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xed\xbf\xbf"); // U+DFFF
+    test_format(SV(R"(["\u{e000}"])"), SV("[{:?}]"), "\xee\x80\x80");           // U+E000 first valid
+                                                                                // (in the Private Use Area block)
+
+    /* U+0000..U+FFFF 1, 2, and 3 code unit range */
+    test_format(SV(R"(["\x{f0}\x{80}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x80\x80"); // U+0000
+    test_format(SV(R"(["\x{f0}\x{80}\x{81}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x81\xbf"); // U+007F
+    test_format(SV(R"(["\x{f0}\x{80}\x{82}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\x82\x80"); // U+0080
+    test_format(SV(R"(["\x{f0}\x{80}\x{9f}\x{bf}"])"), SV("[{:?}]"), "\xf0\x80\x9f\xbf"); // U+07FF
+    test_format(SV(R"(["\x{f0}\x{80}\x{a0}\x{80}"])"), SV("[{:?}]"), "\xf0\x80\xa0\x80"); // U+0800
+    test_format(SV(R"(["\x{f0}\x{8f}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf0\x8f\xbf\xbf"); // U+FFFF
+    test_format(SV("[\"\U00010000\"]"), SV("[{:?}]"), "\xf0\x90\x80\x80");                // U+10000 first valid
+
+    /* U+10FFFF..U+1FFFFF invalid range */
+    test_format(SV(R"(["\u{10ffff}"])"), SV("[{:?}]"), "\xf4\x8f\xbf\xbf"); // U+10FFFF last valid
+                                                                            // (in Supplementary Private Use Area-B)
+    test_format(SV(R"(["\x{f4}\x{90}\x{80}\x{80}"])"), SV("[{:?}]"), "\xf4\x90\x80\x80"); // U+110000
+    test_format(SV(R"(["\x{f4}\x{bf}\x{bf}\x{bf}"])"), SV("[{:?}]"), "\xf4\xbf\xbf\xbf"); // U+11FFFF
   } else {
     // Valid UTF-16 and UTF-32
     test_format(SV("[\"\u00c3\"]"), SV("[{:?}]"), L"\xc3"); // LATIN CAPITAL LETTER A WITH TILDE
@@ -320,11 +357,8 @@ void test_string() {
     // Format
     test_format(V{LR"("\u{ad}\u{600}\u{feff}")"}, L"{:?}", L"\xad\x600\xfeff");
 
-    if constexpr (sizeof(CharT) == 2)
-      // Incomplete surrogate pair in UTF-16
-      test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
-    else
-      test_format(V{LR"("\u{d800}")"}, L"{:?}", L"\xd800");
+    // Incomplete surrogate pair in UTF-16
+    test_format(V{LR"("\x{d800}")"}, L"{:?}", L"\xd800");
 
     // Private_Use
     test_format(V{LR"("\u{e000}\u{f8ff}")"}, L"{:?}", L"\xe000\xf8ff");


        


More information about the libcxx-commits mailing list