[libcxx-commits] [PATCH] D143349: [libc++] Fix UTF-8 decoding in codecvts. Fix #60177.

Thu Apr 13 13:03:10 PDT 2023

tahonermann added a comment.

I added some minor suggested edits, but otherwise, I think this is fine to accept.

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:46
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+  const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
----------------
This depends on the ordinary literal encoding being UTF-8, which is not guaranteed (note that people are working on Clang's support for non-ASCII-based operating systems). The suggested edit avoids that dependency.

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:100
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+  const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:158
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
+  const char32_t expected[]   = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:174
+  // 3. Surrogate CP
+  // 4. Overlong sequence
+  // 5. CP out of Unicode range
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:301
+  const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 5, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:336
+  const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 5, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:387
+  const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 5, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:454
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+  const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:507
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+  const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:570
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP, 3-byte CP and 4-byte CP
+  const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
+  const char16_t expected[]   = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:713
+  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 6, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:748
+  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 6, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:806
+  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 6, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:889
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP and 3-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA";
+  const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:942
+  // UTF-8 string of 1-byte code point (CP), 2-byte CP and 3-byte CP
+  const unsigned char input[] = "b\u0448\uAAAA";
+  const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:991
+void utf8_to_ucs2_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+  const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
+  const char16_t expected[]   = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:1145
+  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA";
+  static_assert(array_size(input) == 4, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:1180
+  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA";
+  static_assert(array_size(input) == 4, "");
----------------

================
Comment at: libcxx/test/std/localization/codecvt_unicode.pass.cpp:1225
+  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+  static_assert(array_size(input) == 6, "");
----------------

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D143349/new/

https://reviews.llvm.org/D143349