[libcxx-commits] [libcxx] 09addf9 - [libc++][format] Fixes UTF-8 continuation.

Mark de Wever via libcxx-commits libcxx-commits at lists.llvm.org
Tue Jun 20 10:28:12 PDT 2023


Author: Mark de Wever
Date: 2023-06-20T19:28:02+02:00
New Revision: 09addf9cbe0a5a8ba9d5666b6e25e6308c441ae2

URL: https://github.com/llvm/llvm-project/commit/09addf9cbe0a5a8ba9d5666b6e25e6308c441ae2
DIFF: https://github.com/llvm/llvm-project/commit/09addf9cbe0a5a8ba9d5666b6e25e6308c441ae2.diff

LOG: [libc++][format] Fixes UTF-8 continuation.

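The mask used to check whether a code unit is a valid continuation was
incorrect and accepted non-continuation code units: it tested only the
most significant bit (0b1000'0000) instead of the two most significant
bits (0b1100'0000), so code units of the form 0b11xx'xxxx were wrongly
treated as continuations. This fixes the issue.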

Reviewed By: ldionne, tahonermann, #libc

Differential Revision: https://reviews.llvm.org/D149672

Added: 
    

Modified: 
    libcxx/include/__format/unicode.h
    libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
    libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp

Removed: 
    


################################################################################
diff  --git a/libcxx/include/__format/unicode.h b/libcxx/include/__format/unicode.h
index 12aed507990e8..c6a124e25ccf3 100644
--- a/libcxx/include/__format/unicode.h
+++ b/libcxx/include/__format/unicode.h
@@ -105,7 +105,7 @@ template <contiguous_iterator _Iterator>
   requires same_as<iter_value_t<_Iterator>, char>
 _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
   do {
-    if ((*__char & 0b1000'0000) != 0b1000'0000)
+    if ((*__char & 0b1100'0000) != 0b1000'0000)
       return false;
     --__count;
     ++__char;

diff  --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
index 95b00d0109bfe..431e0b6805a1a 100644
--- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp
@@ -501,6 +501,12 @@ static void test_ill_formed_utf8() {
               "\xf7\xbf\xbf"
               "a");
 
+  test_format(R"("a\x{f1}\x{80}\x{80}\x{e1}\x{80}\x{c2}b")"sv,
+              "{:?}",
+              "a"
+              "\xf1\x80\x80\xe1\x80\xc2"
+              "b");
+
   // Code unit out of range
   test_format(R"("\u{10ffff}")"sv, "{:?}", "\xf4\x8f\xbf\xbf");               // last valid code point
   test_format(R"("\x{f4}\x{90}\x{80}\x{80}")"sv, "{:?}", "\xf4\x90\x80\x80"); // first invalid code point

diff  --git a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
index 1ee142614b15b..b5b0442fb376c 100644
--- a/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp
@@ -269,6 +269,31 @@ static void test_malformed_code_point() {
     check(SV("*ZZZZ\xefZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xefZZZZ"));
     check(SV("*ZZZZ\xffZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xffZZZZ"));
 
+    // Invalid continuations
+    check(SV("\xc2\x00"), SV("{}"), SV("\xc2\x00"));
+    check(SV("\xc2\x40"), SV("{}"), SV("\xc2\x40"));
+    check(SV("\xc2\xc0"), SV("{}"), SV("\xc2\xc0"));
+
+    check(SV("\xe0\x00\x80"), SV("{}"), SV("\xe0\x00\x80"));
+    check(SV("\xe0\x40\x80"), SV("{}"), SV("\xe0\x40\x80"));
+    check(SV("\xe0\xc0\x80"), SV("{}"), SV("\xe0\xc0\x80"));
+
+    check(SV("\xe0\x80\x00"), SV("{}"), SV("\xe0\x80\x00"));
+    check(SV("\xe0\x80\x40"), SV("{}"), SV("\xe0\x80\x40"));
+    check(SV("\xe0\x80\xc0"), SV("{}"), SV("\xe0\x80\xc0"));
+
+    check(SV("\xf0\x80\x80\x00"), SV("{}"), SV("\xf0\x80\x80\x00"));
+    check(SV("\xf0\x80\x80\x40"), SV("{}"), SV("\xf0\x80\x80\x40"));
+    check(SV("\xf0\x80\x80\xc0"), SV("{}"), SV("\xf0\x80\x80\xc0"));
+
+    check(SV("\xf0\x80\x00\x80"), SV("{}"), SV("\xf0\x80\x00\x80"));
+    check(SV("\xf0\x80\x40\x80"), SV("{}"), SV("\xf0\x80\x40\x80"));
+    check(SV("\xf0\x80\xc0\x80"), SV("{}"), SV("\xf0\x80\xc0\x80"));
+
+    check(SV("\xf0\x00\x80\x80"), SV("{}"), SV("\xf0\x00\x80\x80"));
+    check(SV("\xf0\x40\x80\x80"), SV("{}"), SV("\xf0\x40\x80\x80"));
+    check(SV("\xf0\xc0\x80\x80"), SV("{}"), SV("\xf0\xc0\x80\x80"));
+
     // Premature end.
     check(SV("*ZZZZ\xef\xf5*"), SV("{:*^8}"), SV("ZZZZ\xef\xf5"));
     check(SV("*ZZZZ\xef\xf5ZZZZ*"), SV("{:*^12}"), SV("ZZZZ\xef\xf5ZZZZ"));


        


More information about the libcxx-commits mailing list