[libc-commits] [libc] [libc]Utf8-32-character-converter (PR #143973)
via libc-commits
libc-commits at lists.llvm.org
Thu Jun 12 14:05:51 PDT 2025
https://github.com/sribee8 updated https://github.com/llvm/llvm-project/pull/143973
>From 9561ab5887c72dce013c59d1596afa1ff37183bf Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 20:58:35 +0000
Subject: [PATCH 1/2] [libc] CharacterConverter utf8 to 32 push and pop
Implemented push and pop for UTF-8 to UTF-32 conversion, and added tests.
---
.../__support/wchar/character_converter.cpp | 70 +++++++++-
libc/test/src/__support/CMakeLists.txt | 1 +
libc/test/src/__support/wchar/CMakeLists.txt | 11 ++
.../src/__support/wchar/utf8_to_32_test.cpp | 125 ++++++++++++++++++
4 files changed, 203 insertions(+), 4 deletions(-)
create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt
create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f09c7815a6cc4..9c2fde3134837 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -22,13 +22,75 @@ bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char8_t utf8_byte) {
+ // Checking the first byte if first push
+ if (state->bytes_processed == 0 && state->total_bytes == 0) {
+ // 1 byte total
+ if ((utf8_byte & 128) == 0) {
+ state->total_bytes = 1;
+ state->bytes_processed = 1;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 2 bytes total
+ else if ((utf8_byte & 0xE0) == 0xC0) {
+ state->total_bytes = 2;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x1F;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 3 bytes total
+ else if ((utf8_byte & 0xF0) == 0xE0) {
+ state->total_bytes = 3;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x0F;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 4 bytes total
+ else if ((utf8_byte & 0xF8) == 0xF0) {
+ state->total_bytes = 4;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x07;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // Invalid
+ else {
+ state->bytes_processed++;
+ return -1;
+ }
+ }
+ // Any subsequent push
+ if ((utf8_byte & 0xC0) == 0x80) {
+ state->partial = state->partial << 6;
+ char32_t byte = utf8_byte & 0x3F;
+ state->partial |= byte;
+ state->bytes_processed++;
+ return 0;
+ }
+ state->bytes_processed++;
+ return -1;
+}
-int CharacterConverter::push(char32_t utf32) {}
+int CharacterConverter::push(char32_t utf32) {
+ return utf32;
+}
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+utf_ret<char8_t> CharacterConverter::pop_utf8() {
+ utf_ret<char8_t> utf8;
+ utf8.error = 0;
+ utf8.out = 0;
+ return utf8;
+}
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+utf_ret<char32_t> CharacterConverter::pop_utf32() {
+ utf_ret<char32_t> utf32;
+ utf32.error = 0;
+ utf32.out = state->partial;
+ return utf32;
+}
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..8905ac2127620 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
+add_subdirectory(wchar)
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..cf8e615a4fd59
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+ utf8_to_32_test
+ SUITE
+ libc-support-tests
+ SRCS
+ utf8_to_32_test.cpp
+ DEPENDS
+ libc.src.__support.wchar.character_converter
+)
\ No newline at end of file
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 0000000000000..aef9cfc557549
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,125 @@
+//===-- Unittests for character_converter utf8->32 ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ char ch = 'A';
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ EXPECT_EQ(err, 0);
+ EXPECT_EQ(wch.error, 0);
+ EXPECT_EQ(static_cast<int>(wch.out), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "\xC2\x8E"; // U+008E encoded as UTF-8: hex 0xC2, 0x8E
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "∑"; // hex 0xE2, 0x88, 0x91
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ char_conv.push(static_cast<char8_t>(ch[3]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+ static_cast<char>(0x00),
+ static_cast<char>(0x00)}; // All bytes are invalid
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
+ static_cast<char>(0x80),
+ static_cast<char>(0x80)}; // invalid second byte
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, 0);
+}
>From 54c64e0e952de307ca994320ccffe399b013ebf4 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 21:05:39 +0000
Subject: [PATCH 2/2] fixed formatting
---
libc/src/__support/wchar/character_converter.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 9c2fde3134837..952d6e2ee94c7 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -58,8 +58,8 @@ int CharacterConverter::push(char8_t utf8_byte) {
}
// Invalid
else {
- state->bytes_processed++;
- return -1;
+ state->bytes_processed++;
+ return -1;
}
}
// Any subsequent push
@@ -74,9 +74,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
return -1;
}
-int CharacterConverter::push(char32_t utf32) {
- return utf32;
-}
+int CharacterConverter::push(char32_t utf32) { return utf32; }
utf_ret<char8_t> CharacterConverter::pop_utf8() {
utf_ret<char8_t> utf8;
More information about the libc-commits
mailing list