[libcxx-commits] [libcxx] [libc++] Fix UTF-8 decoding in codecvts. Fix #60177. (PR #68442)
via libcxx-commits
libcxx-commits at lists.llvm.org
Fri Oct 6 13:02:44 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libcxx
<details>
<summary>Changes</summary>
This patch fixes one case where the decoding member function `in()` was returning `partial` instead of `error`. Additionally, it adds large testsuite that tests all `codecvt` facets that were added in C++11 and in C++20. The testsuite covers this bug.
---
Patch is 88.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68442.diff
2 Files Affected:
- (modified) libcxx/src/locale.cpp (+49-23)
- (added) libcxx/test/std/localization/codecvt_unicode.pass.cpp (+2130)
``````````diff
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index 317b4dec7d241e5..18bcf02d2a12f5a 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -1985,10 +1985,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -2004,6 +2003,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
@@ -2016,11 +2018,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF5)
{
- if (frm_end-frm_nxt < 4)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
- uint8_t c4 = frm_nxt[3];
switch (c1)
{
case 0xF0:
@@ -2036,8 +2036,16 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
- if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
- return codecvt_base::error;
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
+ if ((c3 & 0xC0) != 0x80)
+ return codecvt_base::error;
+ if (frm_end-frm_nxt < 4)
+ return codecvt_base::partial;
+ uint8_t c4 = frm_nxt[3];
+ if ((c4 & 0xC0) != 0x80)
+ return codecvt_base::error;
if (to_end-to_nxt < 2)
return codecvt_base::partial;
if ((((c1 & 7UL) << 18) +
@@ -2106,10 +2114,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -2125,6 +2132,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
@@ -2137,11 +2147,9 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
}
else if (c1 < 0xF5)
{
- if (frm_end-frm_nxt < 4)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
- uint8_t c4 = frm_nxt[3];
switch (c1)
{
case 0xF0:
@@ -2157,8 +2165,16 @@ utf8_to_utf16(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nx
return codecvt_base::error;
break;
}
- if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
- return codecvt_base::error;
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
+ if ((c3 & 0xC0) != 0x80)
+ return codecvt_base::error;
+ if (frm_end-frm_nxt < 4)
+ return codecvt_base::partial;
+ uint8_t c4 = frm_nxt[3];
+ if ((c4 & 0xC0) != 0x80)
+ return codecvt_base::error;
if (to_end-to_nxt < 2)
return codecvt_base::partial;
if ((((c1 & 7UL) << 18) +
@@ -2384,10 +2400,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -2403,6 +2418,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint32_t t = static_cast<uint32_t>(((c1 & 0x0F) << 12)
@@ -2415,11 +2433,9 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
}
else if (c1 < 0xF5)
{
- if (frm_end-frm_nxt < 4)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
- uint8_t c4 = frm_nxt[3];
switch (c1)
{
case 0xF0:
@@ -2435,8 +2451,16 @@ utf8_to_ucs4(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
return codecvt_base::error;
break;
}
- if ((c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
- return codecvt_base::error;
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
+ if ((c3 & 0xC0) != 0x80)
+ return codecvt_base::error;
+ if (frm_end-frm_nxt < 4)
+ return codecvt_base::partial;
+ uint8_t c4 = frm_nxt[3];
+ if ((c4 & 0xC0) != 0x80)
+ return codecvt_base::error;
uint32_t t = static_cast<uint32_t>(((c1 & 0x07) << 18)
| ((c2 & 0x3F) << 12)
| ((c3 & 0x3F) << 6)
@@ -2642,10 +2666,9 @@ utf8_to_ucs2(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
}
else if (c1 < 0xF0)
{
- if (frm_end-frm_nxt < 3)
+ if (frm_end-frm_nxt < 2)
return codecvt_base::partial;
uint8_t c2 = frm_nxt[1];
- uint8_t c3 = frm_nxt[2];
switch (c1)
{
case 0xE0:
@@ -2661,6 +2684,9 @@ utf8_to_ucs2(const uint8_t* frm, const uint8_t* frm_end, const uint8_t*& frm_nxt
return codecvt_base::error;
break;
}
+ if (frm_end-frm_nxt < 3)
+ return codecvt_base::partial;
+ uint8_t c3 = frm_nxt[2];
if ((c3 & 0xC0) != 0x80)
return codecvt_base::error;
uint16_t t = static_cast<uint16_t>(((c1 & 0x0F) << 12)
diff --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
new file mode 100644
index 000000000000000..6abe3d7ba69da6a
--- /dev/null
+++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
@@ -0,0 +1,2130 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0|13.0}}
+
+#include <algorithm>
+#include <cassert>
+#include <codecvt>
+#include <locale>
+#include <string>
+
+#include "test_macros.h"
+
+struct test_offsets_ok {
+ size_t in_size;
+ size_t out_size;
+};
+struct test_offsets_partial {
+ size_t in_size;
+ size_t out_size;
+ size_t expected_in_next;
+ size_t expected_out_next;
+};
+
+template <class CharT>
+struct test_offsets_error {
+ size_t in_size;
+ size_t out_size;
+ size_t expected_in_next;
+ size_t expected_out_next;
+ CharT replace_char;
+ size_t replace_pos;
+};
+
+#define array_size(x) (sizeof(x) / sizeof(x)[0])
+
+using std::begin;
+using std::char_traits;
+using std::codecvt_base;
+using std::copy;
+using std::end;
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+ // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
+ const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+ const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+ static_assert(array_size(input) == 11, "");
+ static_assert(array_size(expected) == 5, "");
+
+ ExternT in[array_size(input)];
+ InternT exp[array_size(expected)];
+ copy(begin(input), end(input), begin(in));
+ copy(begin(expected), end(expected), begin(exp));
+ assert(char_traits<ExternT>::length(in) == 10);
+ assert(char_traits<InternT>::length(exp) == 4);
+ test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
+ for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
+ test_offsets_ok t = *it;
+ InternT out[array_size(exp) - 1] = {};
+ assert(t.in_size <= array_size(in));
+ assert(t.out_size <= array_size(out));
+ mbstate_t state = {};
+ const ExternT* in_next = nullptr;
+ InternT* out_next = nullptr;
+ codecvt_base::result res = codecvt_base::ok;
+
+ res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
+ assert(res == cvt.ok);
+ assert(in_next == in + t.in_size);
+ assert(out_next == out + t.out_size);
+ assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
+ if (t.out_size < array_size(out))
+ assert(out[t.out_size] == 0);
+ }
+
+ for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
+ test_offsets_ok t = *it;
+ InternT out[array_size(exp)] = {};
+ assert(t.in_size <= array_size(in));
+ assert(t.out_size <= array_size(out));
+ mbstate_t state = {};
+ const ExternT* in_next = nullptr;
+ InternT* out_next = nullptr;
+ codecvt_base::result res = codecvt_base::ok;
+
+ res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
+ assert(res == cvt.ok);
+ assert(in_next == in + t.in_size);
+ assert(out_next == out + t.out_size);
+ assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
+ if (t.out_size < array_size(out))
+ assert(out[t.out_size] == 0);
+ }
+}
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+ // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
+ const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
+ const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+ static_assert(array_size(input) == 11, "");
+ static_assert(array_size(expected) == 5, "");
+
+ ExternT in[array_size(input)];
+ InternT exp[array_size(expected)];
+ copy(begin(input), end(input), begin(in));
+ copy(begin(expected), end(expected), begin(exp));
+ assert(char_traits<ExternT>::length(in) == 10);
+ assert(char_traits<InternT>::length(exp) == 4);
+
+ test_offsets_partial offsets[] = {
+ {1, 0, 0, 0}, // no space for first CP
+
+ {3, 1, 1, 1}, // no space for second CP
+ {2, 2, 1, 1}, // incomplete second CP
+ {2, 1, 1, 1}, // incomplete second CP, and no space for it
+
+ {6, 2, 3, 2}, // no space for third CP
+ {4, 3, 3, 2}, // incomplete third CP
+ {5, 3, 3, 2}, // incomplete third CP
+ {4, 2, 3, 2}, // incomplete third CP, and no space for it
+ {5, 2, 3, 2}, // incomplete third CP, and no space for it
+
+ {10, 3, 6, 3}, // no space for fourth CP
+ {7, 4, 6, 3}, // incomplete fourth CP
+ {8, 4, 6, 3}, // incomplete fourth CP
+ {9, 4, 6, 3}, // incomplete fourth CP
+ {7, 3, 6, 3}, // incomplete fourth CP, and no space for it
+ {8, 3, 6, 3}, // incomplete fourth CP, and no space for it
+ {9, 3, 6, 3}, // incomplete fourth CP, and no space for it
+ };
+
+ for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
+ test_offsets_partial t = *it;
+ InternT out[array_size(exp) - 1] = {};
+ assert(t.in_size <= array_size(in));
+ assert(t.out_size <= array_size(out));
+ assert(t.expected_in_next <= t.in_size);
+ assert(t.expected_out_next <= t.out_size);
+ mbstate_t state = {};
+ const ExternT* in_next = nullptr;
+ InternT* out_next = nullptr;
+ codecvt_base::result res = codecvt_base::ok;
+
+ res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
+ assert(res == cvt.partial);
+ assert(in_next == in + t.expected_in_next);
+ assert(out_next == out + t.expected_out_next);
+ assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
+ if (t.expected_out_next < array_size(out))
+ assert(out[t.expected_out_next] == 0);
+ }
+}
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+ // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
+ const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
+ const char32_t expected[] = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
+ static_assert(array_size(input) == 11, "");
+ static_assert(array_size(expected) == 5, "");
+
+ ExternT in[array_size(input)];
+ InternT exp[array_size(expected)];
+ copy(begin(input), end(input), begin(in));
+ copy(begin(expected), end(expected), begin(exp));
+ assert(char_traits<ExternT>::length(in) == 10);
+ assert(char_traits<InternT>::length(exp) == 4);
+
+ // There are 5 classes of errors in UTF-8 decoding
+ // 1. Missing leading byte
+ // 2. Missing trailing byte
+ // 3. Surrogate CP
+ // 4. Overlong sequence
+ // 5. CP out of Unicode range
+ test_offsets_error<unsigned char> offsets[] = {
+
+ // 1. Missing leading byte. We will replace the leading byte with
+ // non-leading byte, such as a byte that is always invalid or a trailing
+ // byte.
+
+ // replace leading byte with invalid byte
+ {1, 4, 0, 0, 0xFF, 0},
+ {3, 4, 1, 1, 0xFF, 1},
+ {6, 4, 3, 2, 0xFF, 3},
+ {10, 4, 6, 3, 0xFF, 6},
+
+ // replace leading byte with trailing byte
+ {1, 4, 0, 0, 0b10101010, 0},
+ {3, 4, 1, 1, 0b10101010, 1},
+ {6, 4, 3, 2, 0b10101010, 3},
+ {10, 4, 6, 3, 0b10101010, 6},
+
+ // 2. Missing trailing byte. We will replace the trailing byte with
+ // non-trailing byte, such as a byte that is always invalid or a leading
+ // byte (simple ASCII byte in our case).
+
+ // replace first trailing byte with ASCII byte
+ {3, 4, 1, 1, 'z', 2},
+ {6, 4, 3, 2, 'z', 4},
+ {10, 4, 6, 3, 'z', 7},
+
+ // replace first trailing byte with invalid byte
+ {3, 4, 1, 1, 0xFF, 2},
+ {6, 4, 3, 2, 0xFF, 4},
+ {10, 4, 6, 3, 0xFF, 7},
+
+ // replace second trailing byte with ASCII byte
+ {6, 4, 3, 2, 'z', 5},
+ {10, 4, 6, 3, 'z', 8},
+
+ // replace second trailing byte with invalid byte
+ {6, 4, 3, 2, 0xFF, 5},
+ {10, 4, 6, 3, 0xFF, 8},
+
+ // replace third trailing byte
+ {10, 4, 6, 3, 'z', 9},
+ {10, 4, 6, 3, 0xFF, 9},
+
+ // 2.1 The following test-cases raise doubt whether error or partial should
+ // be returned. For example, we have 4-byte sequence with valid leading
+ // byte. If we hide the last byte we need to return partial. But, if the
+ // second or third byte, which are visible to the call to codecvt, are
+ // malformed then error should be returned.
+
+ // replace first trailing byte with ASCII byte, also incomplete at end
+ {5, 4, 3, 2, 'z', 4},
+ {8, 4, 6, 3, 'z', 7},
+ {9, 4, 6, 3, 'z', 7},
+
+ // replace first trailing byte with invalid byte, also incomplete at end
+ {5, 4, 3, 2, 0xFF, 4},
+ {8, 4, 6, 3, 0xFF, 7},
+ {9, 4, 6, 3, 0xFF, 7},
+
+ // replace second trailing byte with ASCII byte, also incomplete at end
+ {9, 4, 6, 3, 'z', 8},
+
+ // replace second trailing byte with invalid byte, also incomplete at end
+ {9, 4, 6, 3, 0xFF, 8},
+
+ // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
+ // CP U+D700
+ {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
+ {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
+ {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
+ {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
+
+ // 4. Overlong sequence. The CPs in the input are chosen such as modifying
+ // just the leading byte is enough to make them overlong, i.e. for the
+ // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
+ // zeroes.
+ {3, 4, 1, 1, 0b11000000, 1}, // make the 2-byte CP overlong
+ {3, 4, 1, 1, 0b11000001, 1}, // make the 2-byte CP overlong
+ {6, 4, 3, 2, 0b11100000, 3}, // make the 3-byte CP overlong
+ {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
+
+ // 5. CP above range
+ // turn U+10AAAA into U+14AAAA by changing its leading byte
+ {10, 4, 6, 3, 0b11110101, 6},
+ // turn U+10AAAA into U+11AAAA by changing its 2nd byte
+ {10, 4, 6, 3, 0b10011010, 7},
+ };
+ for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
+ test_offsets_error<unsigned char> t = *it;
+ InternT out[array_size(exp) - 1] = {};
+ assert(t.in_size <= array_size(in));
+ assert(t.out_size <= array_size(out));
+ assert(t.expected_in_next <= t.in_size);
+ assert(t.expected_out_next <= t.out_size);
+ ExternT old_char = in[t.replace_pos];
+ in[t.replace_pos] = t.replace_char;
+
+ mbstate_t state = {};
+ const ExternT* in_next = nullptr;
+ InternT* out_next = nullptr;
+ codecvt_base::result res = codecvt_base::ok;
+
+ res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
+ assert(res == cvt.error);
+ assert(in_next == in + t.expected_in_next);
+ assert(out_next == out + t.expected_out_next);
+ assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
+ if (t.expected_out_next < array_size(out))
+ assert(out[t.expected_out_next] == 0);
+
+ in[t.replace_pos] = old_char;
+ }
+}
+
+template <class InternT, class ExternT>
+void utf8_to_utf32_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+ utf8_to_utf32_in_ok(cvt);
+ utf8_to_utf32_in_partial(cvt);
+ utf8_to_utf32_in_error(cvt);
+}
+
+template <class InternT, class ExternT>
+void utf32_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
+ // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
+ const char32_t input[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
+ const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
+ static_assert(array_size(input) == 5, "");
+ static_assert(array_size(expected) == 11, "");
+
+ InternT in[array_size(input)];
+ ExternT exp[array_size(expected)];
+ copy(begin(input), end(input), begin(in));
+ copy(begin(expected), end(expected), begin(exp));
+ assert(char_traits<InternT>::length(in) == 4);
+ assert(char_traits<ExternT>::length(exp) == 10);
+
+ test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
+ for (test_off...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/68442
More information about the libcxx-commits
mailing list