[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)
Brooks Moses via libc-commits
libc-commits at lists.llvm.org
Fri Jun 13 15:51:23 PDT 2025
================
@@ -0,0 +1,197 @@
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+#include "test/UnitTest/Test.h"
+
+// A single ASCII byte: pushing 'A' must report success (0) and the converter
+// must then yield a UTF-32 value equal to 65.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+
+  // Start from an empty conversion state.
+  mbstate conversion_state;
+  conversion_state.total_bytes = 0;
+  conversion_state.bytes_processed = 0;
+
+  char ascii_input = 'A';
+  CharacterConverter converter(&conversion_state);
+
+  int push_status = converter.push(static_cast<char8_t>(ascii_input));
+  auto code_point = converter.pop_utf32();
+
+  ASSERT_EQ(push_status, 0);
+  ASSERT_TRUE(code_point.has_value());
+  ASSERT_EQ(static_cast<int>(code_point.value()), 65);
+}
+
+// A two-byte sequence: 0xC2 0x8E carries the payload bits 00010 001110, i.e.
+// code point U+008E (decimal 142).
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+
+  // Start from an empty conversion state.
+  mbstate conversion_state;
+  conversion_state.total_bytes = 0;
+  conversion_state.bytes_processed = 0;
+
+  const char encoded[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+
+  CharacterConverter converter(&conversion_state);
+  // Feed the bytes in order; the per-byte status is intentionally not checked
+  // here, only the popped result.
+  for (const char byte : encoded)
+    converter.push(static_cast<char8_t>(byte));
+  auto code_point = converter.pop_utf32();
+
+  ASSERT_TRUE(code_point.has_value());
+  ASSERT_EQ(static_cast<int>(code_point.value()), 142);
+}
+
+// A three-byte sequence: 0xE2 0x88 0x91 is the encoding of U+2211 (the ∑
+// summation sign), decimal 8721.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+
+  // Start from an empty conversion state.
+  mbstate conversion_state;
+  conversion_state.total_bytes = 0;
+  conversion_state.bytes_processed = 0;
+
+  const char encoded[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                           static_cast<char>(0x91)};
+
+  CharacterConverter converter(&conversion_state);
+  // Feed the bytes in order; the per-byte status is intentionally not checked
+  // here, only the popped result.
+  for (const char byte : encoded)
+    converter.push(static_cast<char8_t>(byte));
+  auto code_point = converter.pop_utf32();
+
+  ASSERT_TRUE(code_point.has_value());
+  ASSERT_EQ(static_cast<int>(code_point.value()), 8721);
+}
+
+// A four-byte sequence: 0xF0 0x9F 0xA4 0xA1 is the encoding of U+1F921 (the
+// 🤡 clown emoji), decimal 129313.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+
+  // Start from an empty conversion state.
+  mbstate conversion_state;
+  conversion_state.total_bytes = 0;
+  conversion_state.bytes_processed = 0;
+
+  const char encoded[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                           static_cast<char>(0xA4), static_cast<char>(0xA1)};
+
+  CharacterConverter converter(&conversion_state);
+  // Feed the bytes in order; the per-byte status is intentionally not checked
+  // here, only the popped result.
+  for (const char byte : encoded)
+    converter.push(static_cast<char8_t>(byte));
+  auto code_point = converter.pop_utf32();
+
+  ASSERT_TRUE(code_point.has_value());
+  ASSERT_EQ(static_cast<int>(code_point.value()), 129313);
+}
+
+// 0x80 has the bit pattern 10xxxxxx, which is not a valid way to start a
+// UTF-8 sequence, so pushing it into a fresh converter must report -1.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+
+  // Start from an empty conversion state.
+  mbstate conversion_state;
+  conversion_state.total_bytes = 0;
+  conversion_state.bytes_processed = 0;
+
+  const char bad_lead_byte = static_cast<char>(0x80);
+
+  CharacterConverter converter(&conversion_state);
+  int push_status = converter.push(static_cast<char8_t>(bad_lead_byte));
+
+  ASSERT_EQ(push_status, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {
+ static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+ static_cast<char>(0x00)}; // first, third, and last bytes are invalid
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ // Prev byte was single byte so trying to read another should error.
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, -1);
----------------
brooksmoses wrote:
This error also should have a comment explaining it.
https://github.com/llvm/llvm-project/pull/143973
More information about the libc-commits mailing list