[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)
Uzair Nawaz via libc-commits
libc-commits at lists.llvm.org
Thu Jun 12 15:13:00 PDT 2025
================
@@ -0,0 +1,125 @@
+//===-- Unittests for character_converter utf8->32 ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+#include "test/UnitTest/Test.h"
+
+// A single ASCII byte is already a complete UTF-8 sequence: pushing 'A' must
+// report success and popping must yield code point 65 (U+0041) with no error.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+  using LIBC_NAMESPACE::internal::utf_ret;
+
+  // Start from an empty conversion state.
+  mbstate conv_state;
+  conv_state.total_bytes = 0;
+  conv_state.bytes_processed = 0;
+
+  char ascii = 'A';
+  CharacterConverter converter(&conv_state);
+  int push_status = converter.push(static_cast<char8_t>(ascii));
+  utf_ret<char32_t> result = converter.pop_utf32();
+
+  EXPECT_EQ(push_status, 0);
+  EXPECT_EQ(result.error, 0);
+  EXPECT_EQ(static_cast<int>(result.out), 65);
+}
+
+// Feeds the two-byte UTF-8 encoding of U+008E (0xC2, 0x8E) into the converter
+// one byte at a time and checks that the popped UTF-32 value is 142 (0x8E).
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  // U+008E is a non-printable C1 control character. Spell its encoding with
+  // hex escapes rather than a raw literal: editors, mail archives, or a
+  // source-charset conversion can silently drop the raw character, leaving an
+  // empty string and making the ch[1] access below read out of bounds.
+  const char *ch = "\xC2\x8E";
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(wch.error, 0);
+  ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
+
+// Feeds the three-byte UTF-8 encoding of U+2211 into the converter one byte
+// at a time and checks that the popped UTF-32 value is 8721 (0x2211).
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+  using LIBC_NAMESPACE::internal::utf_ret;
+
+  // Start from an empty conversion state.
+  mbstate conv_state;
+  conv_state.total_bytes = 0;
+  conv_state.bytes_processed = 0;
+
+  const char *encoded = "∑"; // bytes 0xE2, 0x88, 0x91
+  CharacterConverter converter(&conv_state);
+  // The push results are deliberately not inspected here; only the final
+  // popped value is verified.
+  for (int idx = 0; idx < 3; ++idx)
+    converter.push(static_cast<char8_t>(encoded[idx]));
+  utf_ret<char32_t> result = converter.pop_utf32();
+
+  ASSERT_EQ(result.error, 0);
+  ASSERT_EQ(static_cast<int>(result.out), 8721);
+}
+
+// Feeds the four-byte UTF-8 encoding of U+1F921 into the converter one byte
+// at a time and checks that the popped UTF-32 value is 129313 (0x1F921).
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+  using LIBC_NAMESPACE::internal::utf_ret;
+
+  // Start from an empty conversion state.
+  mbstate conv_state;
+  conv_state.total_bytes = 0;
+  conv_state.bytes_processed = 0;
+
+  const char *encoded = "🤡"; // bytes 0xF0, 0x9F, 0xA4, 0xA1
+  CharacterConverter converter(&conv_state);
+  // The push results are deliberately not inspected here; only the final
+  // popped value is verified.
+  for (int idx = 0; idx < 4; ++idx)
+    converter.push(static_cast<char8_t>(encoded[idx]));
+  utf_ret<char32_t> result = converter.pop_utf32();
+
+  ASSERT_EQ(result.error, 0);
+  ASSERT_EQ(static_cast<int>(result.out), 129313);
+}
+
+// 0x80 has the bit pattern 10xxxxxx of a UTF-8 continuation byte, so it cannot
+// start a sequence; the very first push is expected to fail with -1.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+
+  // Start from an empty conversion state.
+  mbstate conv_state;
+  conv_state.total_bytes = 0;
+  conv_state.bytes_processed = 0;
+
+  const char bad_lead = static_cast<char>(0x80);
+  CharacterConverter converter(&conv_state);
+  int push_status = converter.push(static_cast<char8_t>(bad_lead));
+
+  ASSERT_EQ(push_status, -1);
+}
+
+// Pushes four bytes, none of which can legally begin a UTF-8 sequence, and
+// expects every individual push to be rejected with -1.
+//
+// 0x00 is deliberately not used as test data: it is a valid one-byte UTF-8
+// sequence (NUL), so it does not fit an "all bytes are invalid" scenario.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {
+      static_cast<char>(0x80),  // continuation byte (10xxxxxx), not a lead
+      static_cast<char>(0xBF),  // continuation byte (10xxxxxx), not a lead
+      static_cast<char>(0xF8),  // five leading 1 bits: no such UTF-8 lead byte
+      static_cast<char>(0xFF)}; // never appears in UTF-8
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, -1);
+}
+
+// Starts a four-byte sequence with lead byte 0xF1, then supplies 0xC0 where a
+// continuation byte (10xxxxxx) is required. The test expects the bad byte to
+// be rejected with -1 and the two following continuation bytes (0x80, 0x80)
+// to be accepted with 0.
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
+  using LIBC_NAMESPACE::internal::CharacterConverter;
+  using LIBC_NAMESPACE::internal::mbstate;
+
+  // Start from an empty conversion state.
+  mbstate conv_state;
+  conv_state.total_bytes = 0;
+  conv_state.bytes_processed = 0;
+
+  // Input bytes; the second one is the invalid byte.
+  const char input[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
+                         static_cast<char>(0x80), static_cast<char>(0x80)};
+  // Expected push() result for each input byte, in order.
+  const int expected_status[4] = {0, -1, 0, 0};
+
+  CharacterConverter converter(&conv_state);
+  for (int idx = 0; idx < 4; ++idx) {
+    int push_status = converter.push(static_cast<char8_t>(input[idx]));
+    ASSERT_EQ(push_status, expected_status[idx]);
+  }
+}
----------------
uzairnawaz wrote:
It might also be worth testing two successful conversions, one after the other, with the same mbstate to make sure that the state gets reset after a push/pop.
https://github.com/llvm/llvm-project/pull/143973
More information about the libc-commits
mailing list