[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)
via libc-commits
libc-commits at lists.llvm.org
Fri Jun 13 09:57:51 PDT 2025
https://github.com/sribee8 updated https://github.com/llvm/llvm-project/pull/143973
>From 9561ab5887c72dce013c59d1596afa1ff37183bf Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 20:58:35 +0000
Subject: [PATCH 01/12] [libc] CharacterConverter utf8 to 32 push and pop
Implemented push and pop for utf8 to 32 conversion and tests.
---
.../__support/wchar/character_converter.cpp | 70 +++++++++-
libc/test/src/__support/CMakeLists.txt | 1 +
libc/test/src/__support/wchar/CMakeLists.txt | 11 ++
.../src/__support/wchar/utf8_to_32_test.cpp | 125 ++++++++++++++++++
4 files changed, 203 insertions(+), 4 deletions(-)
create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt
create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f09c7815a6cc4..9c2fde3134837 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -22,13 +22,75 @@ bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char8_t utf8_byte) {
+ // Checking the first byte if first push
+ if (state->bytes_processed == 0 && state->total_bytes == 0) {
+ // 1 byte total
+ if ((utf8_byte & 128) == 0) {
+ state->total_bytes = 1;
+ state->bytes_processed = 1;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 2 bytes total
+ else if ((utf8_byte & 0xE0) == 0xC0) {
+ state->total_bytes = 2;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x1F;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 3 bytes total
+ else if ((utf8_byte & 0xF0) == 0xE0) {
+ state->total_bytes = 3;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x0F;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 4 bytes total
+ else if ((utf8_byte & 0xF8) == 0xF0) {
+ state->total_bytes = 4;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x07;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // Invalid
+ else {
+ state->bytes_processed++;
+ return -1;
+ }
+ }
+ // Any subsequent push
+ if ((utf8_byte & 0xC0) == 0x80) {
+ state->partial = state->partial << 6;
+ char32_t byte = utf8_byte & 0x3F;
+ state->partial |= byte;
+ state->bytes_processed++;
+ return 0;
+ }
+ state->bytes_processed++;
+ return -1;
+}
-int CharacterConverter::push(char32_t utf32) {}
+int CharacterConverter::push(char32_t utf32) {
+ return utf32;
+}
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+utf_ret<char8_t> CharacterConverter::pop_utf8() {
+ utf_ret<char8_t> utf8;
+ utf8.error = 0;
+ utf8.out = 0;
+ return utf8;
+}
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+utf_ret<char32_t> CharacterConverter::pop_utf32() {
+ utf_ret<char32_t> utf32;
+ utf32.error = 0;
+ utf32.out = state->partial;
+ return utf32;
+}
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..8905ac2127620 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
+add_subdirectory(wchar)
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..cf8e615a4fd59
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+ utf8_to_32_test
+ SUITE
+ libc-support-tests
+ SRCS
+ utf8_to_32_test.cpp
+ DEPENDS
+ libc.src.__support.wchar.character_converter
+)
\ No newline at end of file
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 0000000000000..aef9cfc557549
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,125 @@
+//===-- Unittests for character_converter utf8->3 -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ char ch = 'A';
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ EXPECT_EQ(err, 0);
+ EXPECT_EQ(wch.error, 0);
+ EXPECT_EQ(static_cast<int>(wch.out), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = ""; // hex 0xC2, 0x8E
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "∑"; // hex 0xE2, 0x88, 0x91
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ char_conv.push(static_cast<char8_t>(ch[3]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+ static_cast<char>(0x00),
+ static_cast<char>(0x00)}; // All bytes are invalid
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
+ static_cast<char>(0x80),
+ static_cast<char>(0x80)}; // invalid second byte
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, 0);
+}
>From 54c64e0e952de307ca994320ccffe399b013ebf4 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 21:05:39 +0000
Subject: [PATCH 02/12] fixed formatting
---
libc/src/__support/wchar/character_converter.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 9c2fde3134837..952d6e2ee94c7 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -58,8 +58,8 @@ int CharacterConverter::push(char8_t utf8_byte) {
}
// Invalid
else {
- state->bytes_processed++;
- return -1;
+ state->bytes_processed++;
+ return -1;
}
}
// Any subsequent push
@@ -74,9 +74,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
return -1;
}
-int CharacterConverter::push(char32_t utf32) {
- return utf32;
-}
+int CharacterConverter::push(char32_t utf32) { return utf32; }
utf_ret<char8_t> CharacterConverter::pop_utf8() {
utf_ret<char8_t> utf8;
>From a82f9ee8b1ac9f2e4e7e863eb0bb55d3ea7cd62f Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 21:37:30 +0000
Subject: [PATCH 03/12] Changed cmakelists for tests since MacOS does not have
uchar header
---
libc/test/src/__support/CMakeLists.txt | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 8905ac2127620..9f626ed31cc07 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,4 +275,8 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
-add_subdirectory(wchar)
+# Requires access to uchar header which is not on MacOS
+# Cannot currently build this on MacOS in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+ add_subdirectory(wchar)
+endif()
>From 4087711f7c03cf3cb065603f1a7099c708361189 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 21:42:41 +0000
Subject: [PATCH 04/12] Deleted unused functions
---
libc/src/__support/wchar/character_converter.cpp | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 952d6e2ee94c7..ffc51e6d6dc28 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -74,15 +74,6 @@ int CharacterConverter::push(char8_t utf8_byte) {
return -1;
}
-int CharacterConverter::push(char32_t utf32) { return utf32; }
-
-utf_ret<char8_t> CharacterConverter::pop_utf8() {
- utf_ret<char8_t> utf8;
- utf8.error = 0;
- utf8.out = 0;
- return utf8;
-}
-
utf_ret<char32_t> CharacterConverter::pop_utf32() {
utf_ret<char32_t> utf32;
utf32.error = 0;
>From 20cd8e572c5993694277949098abd8dcc96d15f4 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:09:09 +0000
Subject: [PATCH 05/12] Cleaned up code, added edge cases, added new test cases
for edge cases
---
.../__support/wchar/character_converter.cpp | 47 ++++++------
libc/test/src/__support/wchar/CMakeLists.txt | 2 +-
.../src/__support/wchar/utf8_to_32_test.cpp | 73 ++++++++++++++++---
3 files changed, 89 insertions(+), 33 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ffc51e6d6dc28..276d2d95f47e3 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
+#include "src/__support/CPP/bit.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
@@ -25,52 +26,51 @@ bool CharacterConverter::isComplete() {
int CharacterConverter::push(char8_t utf8_byte) {
// Checking the first byte if first push
if (state->bytes_processed == 0 && state->total_bytes == 0) {
+ state->partial = static_cast<char32_t>(0);
// 1 byte total
- if ((utf8_byte & 128) == 0) {
+ if (cpp::countl_one(utf8_byte) == 0) {
state->total_bytes = 1;
- state->bytes_processed = 1;
- state->partial = static_cast<char32_t>(utf8_byte);
- return 0;
}
// 2 bytes total
- else if ((utf8_byte & 0xE0) == 0xC0) {
+ else if (cpp::countl_one(utf8_byte) == 2) {
state->total_bytes = 2;
- state->bytes_processed = 1;
utf8_byte &= 0x1F;
- state->partial = static_cast<char32_t>(utf8_byte);
- return 0;
}
// 3 bytes total
- else if ((utf8_byte & 0xF0) == 0xE0) {
+ else if (cpp::countl_one(utf8_byte) == 3) {
state->total_bytes = 3;
- state->bytes_processed = 1;
utf8_byte &= 0x0F;
- state->partial = static_cast<char32_t>(utf8_byte);
- return 0;
}
// 4 bytes total
- else if ((utf8_byte & 0xF8) == 0xF0) {
+ else if (cpp::countl_one(utf8_byte) == 4) {
state->total_bytes = 4;
- state->bytes_processed = 1;
utf8_byte &= 0x07;
- state->partial = static_cast<char32_t>(utf8_byte);
- return 0;
}
- // Invalid
+ // Invalid byte -> reset mbstate
else {
- state->bytes_processed++;
+ state->partial = static_cast<char32_t>(0);
+ state->bytes_processed = 0;
+ state->total_bytes = 0;
return -1;
}
+ state->partial = static_cast<char32_t>(utf8_byte);
+ state->bytes_processed++;
+ return 0;
}
// Any subsequent push
- if ((utf8_byte & 0xC0) == 0x80) {
- state->partial = state->partial << 6;
+ if (cpp::countl_one(utf8_byte) == 1 && !isComplete()) {
char32_t byte = utf8_byte & 0x3F;
+ state->partial = state->partial << 6;
state->partial |= byte;
state->bytes_processed++;
return 0;
}
- state->bytes_processed++;
+ // Invalid byte -> reset if we didn't get successful complete read
+ if (!isComplete()) {
+ state->partial = static_cast<char32_t>(0);
+ state->bytes_processed = 0;
+ state->total_bytes = 0;
+ }
return -1;
}
@@ -78,6 +78,11 @@ utf_ret<char32_t> CharacterConverter::pop_utf32() {
utf_ret<char32_t> utf32;
utf32.error = 0;
utf32.out = state->partial;
+ if (!isComplete())
+ utf32.error = -1;
+ state->bytes_processed = 0;
+ state->total_bytes = 0;
+ state->partial = static_cast<char32_t>(0);
return utf32;
}
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index cf8e615a4fd59..1c7132d1b5978 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -8,4 +8,4 @@ add_libc_test(
utf8_to_32_test.cpp
DEPENDS
libc.src.__support.wchar.character_converter
-)
\ No newline at end of file
+)
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index aef9cfc557549..7cf7ddba4bb97 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -1,4 +1,4 @@
-//===-- Unittests for character_converter utf8->3 -------------------------===//
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -30,7 +30,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
- const char *ch = ""; // hex 0xC2, 0x8E
+ const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)}; //
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -45,7 +45,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
- const char *ch = "∑"; // hex 0xE2, 0x88, 0x91
+ const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+ static_cast<char>(0x91)}; // ∑
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -61,7 +62,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
- const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1
+ const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+ static_cast<char>(0xA4), static_cast<char>(0xA1)}; // 🤡
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -90,36 +92,85 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
- const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
- static_cast<char>(0x00),
- static_cast<char>(0x00)}; // All bytes are invalid
+ const char ch[4] = {
+ static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+ static_cast<char>(0x00)}; // first, third, and last bytes are invalid
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, -1);
err = char_conv.push(static_cast<char8_t>(ch[1]));
- ASSERT_EQ(err, -1);
+ ASSERT_EQ(err, 0);
+ // Prev byte was single byte so trying to read another should error.
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, -1);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, -1);
}
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
- const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
+ const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
static_cast<char>(0x80),
- static_cast<char>(0x80)}; // invalid second byte
+ static_cast<char>(0xC0)}; // invalid last byte
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+ static_cast<char>(0x80)};
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ // Should produce an error on 3rd byte
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, -1);
+
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+ ASSERT_EQ(wch.error, 0);
+ // Should still output the correct result.
+ ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+ static_cast<char>(0xC7), static_cast<char>(0x8C)};
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 142);
+
+ // Second two byte character
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, 0);
+ wch = char_conv.pop_utf32();
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 460);
}
>From a188c4bc1146523c2b40dcbf40eef77ddcfc49df Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:29:20 +0000
Subject: [PATCH 06/12] Fixed invalid pop and added test
---
.../src/__support/wchar/character_converter.cpp | 9 +++++----
.../src/__support/wchar/utf8_to_32_test.cpp | 17 +++++++++++++++++
2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 276d2d95f47e3..3c7147d919330 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -48,9 +48,6 @@ int CharacterConverter::push(char8_t utf8_byte) {
}
// Invalid byte -> reset mbstate
else {
- state->partial = static_cast<char32_t>(0);
- state->bytes_processed = 0;
- state->total_bytes = 0;
return -1;
}
state->partial = static_cast<char32_t>(utf8_byte);
@@ -78,8 +75,12 @@ utf_ret<char32_t> CharacterConverter::pop_utf32() {
utf_ret<char32_t> utf32;
utf32.error = 0;
utf32.out = state->partial;
- if (!isComplete())
+ // if pop is called too early
+ if (!isComplete()) {
utf32.error = -1;
+ return utf32;
+ }
+ // reset if successful pop
state->bytes_processed = 0;
state->total_bytes = 0;
state->partial = static_cast<char32_t>(0);
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 7cf7ddba4bb97..e1529320ad648 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -174,3 +174,20 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
ASSERT_EQ(wch.error, 0);
ASSERT_EQ(static_cast<int>(wch.out), 460);
}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvlaidPop) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+ ASSERT_EQ(wch.error, -1); // Should fail since we have not read enough bytes
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ wch = char_conv.pop_utf32();
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
>From 9535c5beb817bbfe9cd498866cf0759ec84ae62f Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:34:48 +0000
Subject: [PATCH 07/12] cleaned up code
---
.../__support/wchar/character_converter.cpp | 22 ++++++++++---------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3c7147d919330..8b4e57a765071 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -27,27 +27,29 @@ int CharacterConverter::push(char8_t utf8_byte) {
// Checking the first byte if first push
if (state->bytes_processed == 0 && state->total_bytes == 0) {
state->partial = static_cast<char32_t>(0);
+ int numOnes = cpp::countl_one(utf8_byte);
+ switch (numOnes) {
// 1 byte total
- if (cpp::countl_one(utf8_byte) == 0) {
+ case 0:
state->total_bytes = 1;
- }
+ break;
// 2 bytes total
- else if (cpp::countl_one(utf8_byte) == 2) {
+ case 2:
state->total_bytes = 2;
utf8_byte &= 0x1F;
- }
+ break;
// 3 bytes total
- else if (cpp::countl_one(utf8_byte) == 3) {
+ case 3:
state->total_bytes = 3;
utf8_byte &= 0x0F;
- }
+ break;
// 4 bytes total
- else if (cpp::countl_one(utf8_byte) == 4) {
+ case 4:
state->total_bytes = 4;
utf8_byte &= 0x07;
- }
- // Invalid byte -> reset mbstate
- else {
+ break;
+ // Invalid first byte
+ default:
return -1;
}
state->partial = static_cast<char32_t>(utf8_byte);
>From 82bff4991e93cb39854ff9cea62d6e40cdd5a3ef Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:56:13 +0000
Subject: [PATCH 08/12] Changed return type to error_or for pop
---
.../__support/wchar/character_converter.cpp | 11 ++--
.../src/__support/wchar/character_converter.h | 5 +-
.../src/__support/wchar/utf8_to_32_test.cpp | 53 ++++++++++---------
3 files changed, 35 insertions(+), 34 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 8b4e57a765071..593d5c98f732a 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -9,6 +9,7 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "src/__support/CPP/bit.h"
+#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
@@ -73,14 +74,12 @@ int CharacterConverter::push(char8_t utf8_byte) {
return -1;
}
-utf_ret<char32_t> CharacterConverter::pop_utf32() {
- utf_ret<char32_t> utf32;
- utf32.error = 0;
- utf32.out = state->partial;
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+ char32_t utf32;
+ utf32 = state->partial;
// if pop is called too early
if (!isComplete()) {
- utf32.error = -1;
- return utf32;
+ return Error(-1);
}
// reset if successful pop
state->bytes_processed = 0;
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d0602d2defe22..45c9176d796d2 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,6 +11,7 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
+#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
@@ -29,8 +30,8 @@ class CharacterConverter {
int push(char8_t utf8_byte);
int push(char32_t utf32);
- utf_ret<char8_t> pop_utf8();
- utf_ret<char32_t> pop_utf32();
+ ErrorOr<char8_t> pop_utf8();
+ ErrorOr<char32_t> pop_utf32();
};
} // namespace internal
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index e1529320ad648..949cc02027d3a 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/utf_ret.h"
@@ -19,11 +20,11 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
int err = char_conv.push(static_cast<char8_t>(ch));
- LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+ auto wch = char_conv.pop_utf32();
- EXPECT_EQ(err, 0);
- EXPECT_EQ(wch.error, 0);
- EXPECT_EQ(static_cast<int>(wch.out), 65);
+ ASSERT_EQ(err, 0);
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 65);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
@@ -35,10 +36,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
- LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+ auto wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, 0);
- ASSERT_EQ(static_cast<int>(wch.out), 142);
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 142);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
@@ -52,10 +53,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
char_conv.push(static_cast<char8_t>(ch[0]));
char_conv.push(static_cast<char8_t>(ch[1]));
char_conv.push(static_cast<char8_t>(ch[2]));
- LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+ auto wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, 0);
- ASSERT_EQ(static_cast<int>(wch.out), 8721);
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 8721);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
@@ -70,10 +71,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
char_conv.push(static_cast<char8_t>(ch[1]));
char_conv.push(static_cast<char8_t>(ch[2]));
char_conv.push(static_cast<char8_t>(ch[3]));
- LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+ auto wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, 0);
- ASSERT_EQ(static_cast<int>(wch.out), 129313);
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 129313);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
@@ -143,10 +144,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
err = char_conv.push(static_cast<char8_t>(ch[2]));
ASSERT_EQ(err, -1);
- LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, 0);
+ auto wch = char_conv.pop_utf32();
+ ASSERT_TRUE(wch.has_value());
// Should still output the correct result.
- ASSERT_EQ(static_cast<int>(wch.out), 142);
+ ASSERT_EQ(static_cast<int>(wch.value()), 142);
}
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
@@ -161,9 +162,9 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
ASSERT_EQ(err, 0);
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
- LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, 0);
- ASSERT_EQ(static_cast<int>(wch.out), 142);
+ auto wch = char_conv.pop_utf32();
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 142);
// Second two byte character
err = char_conv.push(static_cast<char8_t>(ch[2]));
@@ -171,11 +172,11 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
err = char_conv.push(static_cast<char8_t>(ch[3]));
ASSERT_EQ(err, 0);
wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, 0);
- ASSERT_EQ(static_cast<int>(wch.out), 460);
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 460);
}
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvlaidPop) {
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
@@ -183,11 +184,11 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvlaidPop) {
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
- LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, -1); // Should fail since we have not read enough bytes
+ auto wch = char_conv.pop_utf32();
+ ASSERT_FALSE(wch.has_value()); // Should fail since we have not read enough bytes
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
wch = char_conv.pop_utf32();
- ASSERT_EQ(wch.error, 0);
- ASSERT_EQ(static_cast<int>(wch.out), 142);
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 142);
}
>From b1e17adb8edbea380d38e27b1c9fba389111ee69 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:57:26 +0000
Subject: [PATCH 09/12] fixed formatting
---
libc/test/src/__support/wchar/utf8_to_32_test.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 949cc02027d3a..95b14b287bd05 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -185,7 +185,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
int err = char_conv.push(static_cast<char8_t>(ch[0]));
ASSERT_EQ(err, 0);
auto wch = char_conv.pop_utf32();
- ASSERT_FALSE(wch.has_value()); // Should fail since we have not read enough bytes
+ ASSERT_FALSE(
+ wch.has_value()); // Should fail since we have not read enough bytes
err = char_conv.push(static_cast<char8_t>(ch[1]));
ASSERT_EQ(err, 0);
wch = char_conv.pop_utf32();
>From 3ad6866c1c92e5dfe502740ced8ae7f20e83c8cf Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Fri, 13 Jun 2025 16:30:15 +0000
Subject: [PATCH 10/12] code cleanup
---
libc/src/__support/wchar/character_converter.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 593d5c98f732a..6074557dc3687 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -75,12 +75,12 @@ int CharacterConverter::push(char8_t utf8_byte) {
}
ErrorOr<char32_t> CharacterConverter::pop_utf32() {
- char32_t utf32;
- utf32 = state->partial;
// if pop is called too early
- if (!isComplete()) {
+ if (!isComplete())
return Error(-1);
- }
+
+ char32_t utf32 = state->partial;
+
// reset if successful pop
state->bytes_processed = 0;
state->total_bytes = 0;
>From 3323d0de7df4255198c4462c9966a8adcaac224a Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Fri, 13 Jun 2025 16:34:24 +0000
Subject: [PATCH 11/12] fixed formatting
---
libc/src/__support/wchar/character_converter.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 6074557dc3687..8cdd114a69280 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -78,7 +78,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
// if pop is called too early
if (!isComplete())
return Error(-1);
-
+
char32_t utf32 = state->partial;
// reset if successful pop
>From 19fabd3630debea1b3e63ac297d7ca702800b43e Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Fri, 13 Jun 2025 16:57:31 +0000
Subject: [PATCH 12/12] Cleaned up pushing first byte and some readability
changes
---
.../__support/wchar/character_converter.cpp | 33 +++++++------------
.../src/__support/wchar/utf8_to_32_test.cpp | 8 +++--
2 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 8cdd114a69280..520ec7b75076d 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -28,29 +28,18 @@ int CharacterConverter::push(char8_t utf8_byte) {
// Checking the first byte if first push
if (state->bytes_processed == 0 && state->total_bytes == 0) {
state->partial = static_cast<char32_t>(0);
- int numOnes = cpp::countl_one(utf8_byte);
- switch (numOnes) {
+ uint8_t numOnes = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
// 1 byte total
- case 0:
+ if (numOnes == 0) {
state->total_bytes = 1;
- break;
- // 2 bytes total
- case 2:
- state->total_bytes = 2;
- utf8_byte &= 0x1F;
- break;
- // 3 bytes total
- case 3:
- state->total_bytes = 3;
- utf8_byte &= 0x0F;
- break;
- // 4 bytes total
- case 4:
- state->total_bytes = 4;
- utf8_byte &= 0x07;
- break;
+ }
+ // 2 through 4 bytes total
+ else if (numOnes >= 2 && numOnes <= 4) {
+ state->total_bytes = numOnes;
+ utf8_byte &= (0x7F >> numOnes);
+ }
// Invalid first byte
- default:
+ else {
return -1;
}
state->partial = static_cast<char32_t>(utf8_byte);
@@ -58,9 +47,11 @@ int CharacterConverter::push(char8_t utf8_byte) {
return 0;
}
// Any subsequent push
+ // Adding 6 more bits so need to left shift
+ const int shift_amount = 6;
if (cpp::countl_one(utf8_byte) == 1 && !isComplete()) {
char32_t byte = utf8_byte & 0x3F;
- state->partial = state->partial << 6;
+ state->partial = state->partial << shift_amount;
state->partial |= byte;
state->bytes_processed++;
return 0;
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 95b14b287bd05..dabdb5bfc7481 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -31,7 +31,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.total_bytes = 0;
- const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)}; //
+ const char ch[2] = {static_cast<char>(0xC2),
+ static_cast<char>(0x8E)}; // car symbol
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -47,7 +48,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
- static_cast<char>(0x91)}; // ∑
+ static_cast<char>(0x91)}; // ∑ sigma symbol
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
@@ -64,7 +65,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
state.bytes_processed = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
- static_cast<char>(0xA4), static_cast<char>(0xA1)}; // 🤡
+ static_cast<char>(0xA4),
+ static_cast<char>(0xA1)}; // 🤡 clown emoji
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
char_conv.push(static_cast<char8_t>(ch[0]));
More information about the libc-commits
mailing list