[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)

via libc-commits libc-commits at lists.llvm.org
Fri Jun 13 09:35:16 PDT 2025


https://github.com/sribee8 updated https://github.com/llvm/llvm-project/pull/143973

>From 9561ab5887c72dce013c59d1596afa1ff37183bf Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 20:58:35 +0000
Subject: [PATCH 01/11] [libc] CharacterConverter utf8 to 32 push and pop

Implemented push and pop for utf8 to 32 conversion and tests.
---
 .../__support/wchar/character_converter.cpp   |  70 +++++++++-
 libc/test/src/__support/CMakeLists.txt        |   1 +
 libc/test/src/__support/wchar/CMakeLists.txt  |  11 ++
 .../src/__support/wchar/utf8_to_32_test.cpp   | 125 ++++++++++++++++++
 4 files changed, 203 insertions(+), 4 deletions(-)
 create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt
 create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f09c7815a6cc4..9c2fde3134837 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -22,13 +22,75 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char8_t utf8_byte) {
+  // Checking the first byte if first push
+  if (state->bytes_processed == 0 && state->total_bytes == 0) {
+    // 1 byte total
+    if ((utf8_byte & 128) == 0) {
+      state->total_bytes = 1;
+      state->bytes_processed = 1;
+      state->partial = static_cast<char32_t>(utf8_byte);
+      return 0;
+    }
+    // 2 bytes total
+    else if ((utf8_byte & 0xE0) == 0xC0) {
+      state->total_bytes = 2;
+      state->bytes_processed = 1;
+      utf8_byte &= 0x1F;
+      state->partial = static_cast<char32_t>(utf8_byte);
+      return 0;
+    }
+    // 3 bytes total
+    else if ((utf8_byte & 0xF0) == 0xE0) {
+      state->total_bytes = 3;
+      state->bytes_processed = 1;
+      utf8_byte &= 0x0F;
+      state->partial = static_cast<char32_t>(utf8_byte);
+      return 0;
+    }
+    // 4 bytes total
+    else if ((utf8_byte & 0xF8) == 0xF0) {
+      state->total_bytes = 4;
+      state->bytes_processed = 1;
+      utf8_byte &= 0x07;
+      state->partial = static_cast<char32_t>(utf8_byte);
+      return 0;
+    }
+    // Invalid
+    else {
+        state->bytes_processed++;
+        return -1;
+    }
+  }
+  // Any subsequent push
+  if ((utf8_byte & 0xC0) == 0x80) {
+    state->partial = state->partial << 6;
+    char32_t byte = utf8_byte & 0x3F;
+    state->partial |= byte;
+    state->bytes_processed++;
+    return 0;
+  }
+  state->bytes_processed++;
+  return -1;
+}
 
-int CharacterConverter::push(char32_t utf32) {}
+int CharacterConverter::push(char32_t utf32) { 
+    return utf32; 
+}
 
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+utf_ret<char8_t> CharacterConverter::pop_utf8() {
+  utf_ret<char8_t> utf8;
+  utf8.error = 0;
+  utf8.out = 0;
+  return utf8;
+}
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+utf_ret<char32_t> CharacterConverter::pop_utf32() {
+  utf_ret<char32_t> utf32;
+  utf32.error = 0;
+  utf32.out = state->partial;
+  return utf32;
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..8905ac2127620 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
+add_subdirectory(wchar)
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..cf8e615a4fd59
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf8_to_32_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf8_to_32_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
\ No newline at end of file
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 0000000000000..aef9cfc557549
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,125 @@
+//===-- Unittests for character_converter utf8->3 -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  char ch = 'A';
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+  EXPECT_EQ(err, 0);
+  EXPECT_EQ(wch.error, 0);
+  EXPECT_EQ(static_cast<int>(wch.out), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char *ch = "Ž"; // hex 0xC2, 0x8E
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(wch.error, 0);
+  ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char *ch = "∑"; // hex 0xE2, 0x88, 0x91
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(wch.error, 0);
+  ASSERT_EQ(static_cast<int>(wch.out), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  char_conv.push(static_cast<char8_t>(ch[3]));
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(wch.error, 0);
+  ASSERT_EQ(static_cast<int>(wch.out), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x00),
+                      static_cast<char>(0x00)}; // All bytes are invalid
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x80)}; // invalid second byte
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+}

>From 54c64e0e952de307ca994320ccffe399b013ebf4 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 21:05:39 +0000
Subject: [PATCH 02/11] fixed formatting

---
 libc/src/__support/wchar/character_converter.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 9c2fde3134837..952d6e2ee94c7 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -58,8 +58,8 @@ int CharacterConverter::push(char8_t utf8_byte) {
     }
     // Invalid
     else {
-        state->bytes_processed++;
-        return -1;
+      state->bytes_processed++;
+      return -1;
     }
   }
   // Any subsequent push
@@ -74,9 +74,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
   return -1;
 }
 
-int CharacterConverter::push(char32_t utf32) { 
-    return utf32; 
-}
+int CharacterConverter::push(char32_t utf32) { return utf32; }
 
 utf_ret<char8_t> CharacterConverter::pop_utf8() {
   utf_ret<char8_t> utf8;

>From a82f9ee8b1ac9f2e4e7e863eb0bb55d3ea7cd62f Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 21:37:30 +0000
Subject: [PATCH 03/11] Changed cmakelists for tests since MacOS does not have
 uchar header

---
 libc/test/src/__support/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 8905ac2127620..9f626ed31cc07 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,4 +275,8 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-add_subdirectory(wchar)
+# Requires access to uchar header which is not on MacOS
+# Cannot currently build this on MacOS in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()

>From 4087711f7c03cf3cb065603f1a7099c708361189 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 21:42:41 +0000
Subject: [PATCH 04/11] Deleted unused functions

---
 libc/src/__support/wchar/character_converter.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 952d6e2ee94c7..ffc51e6d6dc28 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -74,15 +74,6 @@ int CharacterConverter::push(char8_t utf8_byte) {
   return -1;
 }
 
-int CharacterConverter::push(char32_t utf32) { return utf32; }
-
-utf_ret<char8_t> CharacterConverter::pop_utf8() {
-  utf_ret<char8_t> utf8;
-  utf8.error = 0;
-  utf8.out = 0;
-  return utf8;
-}
-
 utf_ret<char32_t> CharacterConverter::pop_utf32() {
   utf_ret<char32_t> utf32;
   utf32.error = 0;

>From 20cd8e572c5993694277949098abd8dcc96d15f4 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:09:09 +0000
Subject: [PATCH 05/11] Cleaned up code, added edge cases, added new test cases
 for edge cases

---
 .../__support/wchar/character_converter.cpp   | 47 ++++++------
 libc/test/src/__support/wchar/CMakeLists.txt  |  2 +-
 .../src/__support/wchar/utf8_to_32_test.cpp   | 73 ++++++++++++++++---
 3 files changed, 89 insertions(+), 33 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ffc51e6d6dc28..276d2d95f47e3 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/CPP/bit.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
 
@@ -25,52 +26,51 @@ bool CharacterConverter::isComplete() {
 int CharacterConverter::push(char8_t utf8_byte) {
   // Checking the first byte if first push
   if (state->bytes_processed == 0 && state->total_bytes == 0) {
+    state->partial = static_cast<char32_t>(0);
     // 1 byte total
-    if ((utf8_byte & 128) == 0) {
+    if (cpp::countl_one(utf8_byte) == 0) {
       state->total_bytes = 1;
-      state->bytes_processed = 1;
-      state->partial = static_cast<char32_t>(utf8_byte);
-      return 0;
     }
     // 2 bytes total
-    else if ((utf8_byte & 0xE0) == 0xC0) {
+    else if (cpp::countl_one(utf8_byte) == 2) {
       state->total_bytes = 2;
-      state->bytes_processed = 1;
       utf8_byte &= 0x1F;
-      state->partial = static_cast<char32_t>(utf8_byte);
-      return 0;
     }
     // 3 bytes total
-    else if ((utf8_byte & 0xF0) == 0xE0) {
+    else if (cpp::countl_one(utf8_byte) == 3) {
       state->total_bytes = 3;
-      state->bytes_processed = 1;
       utf8_byte &= 0x0F;
-      state->partial = static_cast<char32_t>(utf8_byte);
-      return 0;
     }
     // 4 bytes total
-    else if ((utf8_byte & 0xF8) == 0xF0) {
+    else if (cpp::countl_one(utf8_byte) == 4) {
       state->total_bytes = 4;
-      state->bytes_processed = 1;
       utf8_byte &= 0x07;
-      state->partial = static_cast<char32_t>(utf8_byte);
-      return 0;
     }
-    // Invalid
+    // Invalid byte -> reset mbstate
     else {
-      state->bytes_processed++;
+      state->partial = static_cast<char32_t>(0);
+      state->bytes_processed = 0;
+      state->total_bytes = 0;
       return -1;
     }
+    state->partial = static_cast<char32_t>(utf8_byte);
+    state->bytes_processed++;
+    return 0;
   }
   // Any subsequent push
-  if ((utf8_byte & 0xC0) == 0x80) {
-    state->partial = state->partial << 6;
+  if (cpp::countl_one(utf8_byte) == 1 && !isComplete()) {
     char32_t byte = utf8_byte & 0x3F;
+    state->partial = state->partial << 6;
     state->partial |= byte;
     state->bytes_processed++;
     return 0;
   }
-  state->bytes_processed++;
+  // Invalid byte -> reset if we didn't get successful complete read
+  if (!isComplete()) {
+    state->partial = static_cast<char32_t>(0);
+    state->bytes_processed = 0;
+    state->total_bytes = 0;
+  }
   return -1;
 }
 
@@ -78,6 +78,11 @@ utf_ret<char32_t> CharacterConverter::pop_utf32() {
   utf_ret<char32_t> utf32;
   utf32.error = 0;
   utf32.out = state->partial;
+  if (!isComplete())
+    utf32.error = -1;
+  state->bytes_processed = 0;
+  state->total_bytes = 0;
+  state->partial = static_cast<char32_t>(0);
   return utf32;
 }
 
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index cf8e615a4fd59..1c7132d1b5978 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -8,4 +8,4 @@ add_libc_test(
     utf8_to_32_test.cpp 
   DEPENDS
     libc.src.__support.wchar.character_converter
-)
\ No newline at end of file
+)
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index aef9cfc557549..7cf7ddba4bb97 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -1,4 +1,4 @@
-//===-- Unittests for character_converter utf8->3 -------------------------===//
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -30,7 +30,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
   state.bytes_processed = 0;
   state.total_bytes = 0;
-  const char *ch = "Ž"; // hex 0xC2, 0x8E
+  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)}; // Ž
 
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   char_conv.push(static_cast<char8_t>(ch[0]));
@@ -45,7 +45,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
   state.bytes_processed = 0;
   state.total_bytes = 0;
-  const char *ch = "∑"; // hex 0xE2, 0x88, 0x91
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑
 
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   char_conv.push(static_cast<char8_t>(ch[0]));
@@ -61,7 +62,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
   state.bytes_processed = 0;
   state.total_bytes = 0;
-  const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4), static_cast<char>(0xA1)}; // 🤡
 
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   char_conv.push(static_cast<char8_t>(ch[0]));
@@ -90,36 +92,85 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   state.bytes_processed = 0;
   state.total_bytes = 0;
-  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
-                      static_cast<char>(0x00),
-                      static_cast<char>(0x00)}; // All bytes are invalid
+  const char ch[4] = {
+      static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+      static_cast<char>(0x00)}; // first, third, and last bytes are invalid
 
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   int err = char_conv.push(static_cast<char8_t>(ch[0]));
   ASSERT_EQ(err, -1);
   err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, -1);
+  ASSERT_EQ(err, 0);
+  // Prev byte was single byte so trying to read another should error.
   err = char_conv.push(static_cast<char8_t>(ch[2]));
   ASSERT_EQ(err, -1);
   err = char_conv.push(static_cast<char8_t>(ch[3]));
   ASSERT_EQ(err, -1);
 }
 
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   state.bytes_processed = 0;
   state.total_bytes = 0;
-  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
                       static_cast<char>(0x80),
-                      static_cast<char>(0x80)}; // invalid second byte
+                      static_cast<char>(0xC0)}; // invalid last byte
 
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   int err = char_conv.push(static_cast<char8_t>(ch[0]));
   ASSERT_EQ(err, 0);
   err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
   ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Should produce an error on 3rd byte
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+  ASSERT_EQ(wch.error, 0);
+  // Should still output the correct result.
+  ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+  ASSERT_EQ(wch.error, 0);
+  ASSERT_EQ(static_cast<int>(wch.out), 142);
+
+  // Second two byte character
   err = char_conv.push(static_cast<char8_t>(ch[2]));
   ASSERT_EQ(err, 0);
   err = char_conv.push(static_cast<char8_t>(ch[3]));
   ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_EQ(wch.error, 0);
+  ASSERT_EQ(static_cast<int>(wch.out), 460);
 }

>From a188c4bc1146523c2b40dcbf40eef77ddcfc49df Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:29:20 +0000
Subject: [PATCH 06/11] Fixed invalid pop and added test

---
 .../src/__support/wchar/character_converter.cpp |  9 +++++----
 .../src/__support/wchar/utf8_to_32_test.cpp     | 17 +++++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 276d2d95f47e3..3c7147d919330 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -48,9 +48,6 @@ int CharacterConverter::push(char8_t utf8_byte) {
     }
     // Invalid byte -> reset mbstate
     else {
-      state->partial = static_cast<char32_t>(0);
-      state->bytes_processed = 0;
-      state->total_bytes = 0;
       return -1;
     }
     state->partial = static_cast<char32_t>(utf8_byte);
@@ -78,8 +75,12 @@ utf_ret<char32_t> CharacterConverter::pop_utf32() {
   utf_ret<char32_t> utf32;
   utf32.error = 0;
   utf32.out = state->partial;
-  if (!isComplete())
+  // if pop is called too early
+  if (!isComplete()) {
     utf32.error = -1;
+    return utf32;
+  }
+  // reset if successful pop
   state->bytes_processed = 0;
   state->total_bytes = 0;
   state->partial = static_cast<char32_t>(0);
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 7cf7ddba4bb97..e1529320ad648 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -174,3 +174,20 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
   ASSERT_EQ(wch.error, 0);
   ASSERT_EQ(static_cast<int>(wch.out), 460);
 }
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvlaidPop) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+  ASSERT_EQ(wch.error, -1); // Should fail since we have not read enough bytes
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_EQ(wch.error, 0); 
+  ASSERT_EQ(static_cast<int>(wch.out), 142);
+}

>From 9535c5beb817bbfe9cd498866cf0759ec84ae62f Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:34:48 +0000
Subject: [PATCH 07/11] cleaned up code

---
 .../__support/wchar/character_converter.cpp   | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3c7147d919330..8b4e57a765071 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -27,27 +27,29 @@ int CharacterConverter::push(char8_t utf8_byte) {
   // Checking the first byte if first push
   if (state->bytes_processed == 0 && state->total_bytes == 0) {
     state->partial = static_cast<char32_t>(0);
+    int numOnes = cpp::countl_one(utf8_byte);
+    switch (numOnes) {
     // 1 byte total
-    if (cpp::countl_one(utf8_byte) == 0) {
+    case 0:
       state->total_bytes = 1;
-    }
+      break;
     // 2 bytes total
-    else if (cpp::countl_one(utf8_byte) == 2) {
+    case 2:
       state->total_bytes = 2;
       utf8_byte &= 0x1F;
-    }
+      break;
     // 3 bytes total
-    else if (cpp::countl_one(utf8_byte) == 3) {
+    case 3:
       state->total_bytes = 3;
       utf8_byte &= 0x0F;
-    }
+      break;
     // 4 bytes total
-    else if (cpp::countl_one(utf8_byte) == 4) {
+    case 4:
       state->total_bytes = 4;
       utf8_byte &= 0x07;
-    }
-    // Invalid byte -> reset mbstate
-    else {
+      break;
+    // Invalid first byte
+    default:
       return -1;
     }
     state->partial = static_cast<char32_t>(utf8_byte);

>From 82bff4991e93cb39854ff9cea62d6e40cdd5a3ef Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:56:13 +0000
Subject: [PATCH 08/11] Changed return type to error_or for pop

---
 .../__support/wchar/character_converter.cpp   | 11 ++--
 .../src/__support/wchar/character_converter.h |  5 +-
 .../src/__support/wchar/utf8_to_32_test.cpp   | 53 ++++++++++---------
 3 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 8b4e57a765071..593d5c98f732a 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -9,6 +9,7 @@
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
 #include "src/__support/CPP/bit.h"
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
 
@@ -73,14 +74,12 @@ int CharacterConverter::push(char8_t utf8_byte) {
   return -1;
 }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {
-  utf_ret<char32_t> utf32;
-  utf32.error = 0;
-  utf32.out = state->partial;
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+  char32_t utf32;
+  utf32 = state->partial;
   // if pop is called too early
   if (!isComplete()) {
-    utf32.error = -1;
-    return utf32;
+    return Error(-1);
   }
   // reset if successful pop
   state->bytes_processed = 0;
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d0602d2defe22..45c9176d796d2 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,6 +11,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
 
@@ -29,8 +30,8 @@ class CharacterConverter {
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
 
-  utf_ret<char8_t> pop_utf8();
-  utf_ret<char32_t> pop_utf32();
+  ErrorOr<char8_t> pop_utf8();
+  ErrorOr<char32_t> pop_utf32();
 };
 
 } // namespace internal
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index e1529320ad648..949cc02027d3a 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/character_converter.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
@@ -19,11 +20,11 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
 
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   int err = char_conv.push(static_cast<char8_t>(ch));
-  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+  auto wch = char_conv.pop_utf32();
 
-  EXPECT_EQ(err, 0);
-  EXPECT_EQ(wch.error, 0);
-  EXPECT_EQ(static_cast<int>(wch.out), 65);
+  ASSERT_EQ(err, 0);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 65);
 }
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
@@ -35,10 +36,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   char_conv.push(static_cast<char8_t>(ch[0]));
   char_conv.push(static_cast<char8_t>(ch[1]));
-  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+  auto wch = char_conv.pop_utf32();
 
-  ASSERT_EQ(wch.error, 0);
-  ASSERT_EQ(static_cast<int>(wch.out), 142);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
 }
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
@@ -52,10 +53,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
   char_conv.push(static_cast<char8_t>(ch[0]));
   char_conv.push(static_cast<char8_t>(ch[1]));
   char_conv.push(static_cast<char8_t>(ch[2]));
-  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+  auto wch = char_conv.pop_utf32();
 
-  ASSERT_EQ(wch.error, 0);
-  ASSERT_EQ(static_cast<int>(wch.out), 8721);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
 }
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
@@ -70,10 +71,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
   char_conv.push(static_cast<char8_t>(ch[1]));
   char_conv.push(static_cast<char8_t>(ch[2]));
   char_conv.push(static_cast<char8_t>(ch[3]));
-  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+  auto wch = char_conv.pop_utf32();
 
-  ASSERT_EQ(wch.error, 0);
-  ASSERT_EQ(static_cast<int>(wch.out), 129313);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
 }
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
@@ -143,10 +144,10 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
   err = char_conv.push(static_cast<char8_t>(ch[2]));
   ASSERT_EQ(err, -1);
 
-  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
-  ASSERT_EQ(wch.error, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
   // Should still output the correct result.
-  ASSERT_EQ(static_cast<int>(wch.out), 142);
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
 }
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
@@ -161,9 +162,9 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
   ASSERT_EQ(err, 0);
   err = char_conv.push(static_cast<char8_t>(ch[1]));
   ASSERT_EQ(err, 0);
-  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
-  ASSERT_EQ(wch.error, 0);
-  ASSERT_EQ(static_cast<int>(wch.out), 142);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
 
   // Second two byte character
   err = char_conv.push(static_cast<char8_t>(ch[2]));
@@ -171,11 +172,11 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
   err = char_conv.push(static_cast<char8_t>(ch[3]));
   ASSERT_EQ(err, 0);
   wch = char_conv.pop_utf32();
-  ASSERT_EQ(wch.error, 0);
-  ASSERT_EQ(static_cast<int>(wch.out), 460);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 460);
 }
 
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvlaidPop) {
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
   LIBC_NAMESPACE::internal::mbstate state;
   state.bytes_processed = 0;
   state.total_bytes = 0;
@@ -183,11 +184,11 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvlaidPop) {
   const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
   int err = char_conv.push(static_cast<char8_t>(ch[0]));
   ASSERT_EQ(err, 0);
-  LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
-  ASSERT_EQ(wch.error, -1); // Should fail since we have not read enough bytes
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(wch.has_value()); // Should fail since we have not read enough bytes
   err = char_conv.push(static_cast<char8_t>(ch[1]));
   ASSERT_EQ(err, 0);
   wch = char_conv.pop_utf32();
-  ASSERT_EQ(wch.error, 0); 
-  ASSERT_EQ(static_cast<int>(wch.out), 142);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
 }

>From b1e17adb8edbea380d38e27b1c9fba389111ee69 Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Thu, 12 Jun 2025 23:57:26 +0000
Subject: [PATCH 09/11] fixed formatting

---
 libc/test/src/__support/wchar/utf8_to_32_test.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 949cc02027d3a..95b14b287bd05 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -185,7 +185,8 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
   int err = char_conv.push(static_cast<char8_t>(ch[0]));
   ASSERT_EQ(err, 0);
   auto wch = char_conv.pop_utf32();
-  ASSERT_FALSE(wch.has_value()); // Should fail since we have not read enough bytes
+  ASSERT_FALSE(
+      wch.has_value()); // Should fail since we have not read enough bytes
   err = char_conv.push(static_cast<char8_t>(ch[1]));
   ASSERT_EQ(err, 0);
   wch = char_conv.pop_utf32();

>From 3ad6866c1c92e5dfe502740ced8ae7f20e83c8cf Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Fri, 13 Jun 2025 16:30:15 +0000
Subject: [PATCH 10/11] code cleanup

---
 libc/src/__support/wchar/character_converter.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 593d5c98f732a..6074557dc3687 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -75,12 +75,12 @@ int CharacterConverter::push(char8_t utf8_byte) {
 }
 
 ErrorOr<char32_t> CharacterConverter::pop_utf32() {
-  char32_t utf32;
-  utf32 = state->partial;
   // if pop is called too early
-  if (!isComplete()) {
+  if (!isComplete())
     return Error(-1);
-  }
+    
+  char32_t utf32 = state->partial;
+
   // reset if successful pop
   state->bytes_processed = 0;
   state->total_bytes = 0;

>From 3323d0de7df4255198c4462c9966a8adcaac224a Mon Sep 17 00:00:00 2001
From: Sriya Pratipati <sriyap at google.com>
Date: Fri, 13 Jun 2025 16:34:24 +0000
Subject: [PATCH 11/11] fixed formatting

---
 libc/src/__support/wchar/character_converter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 6074557dc3687..8cdd114a69280 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -78,7 +78,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
   // if pop is called too early
   if (!isComplete())
     return Error(-1);
-    
+
   char32_t utf32 = state->partial;
 
   // reset if successful pop



More information about the libc-commits mailing list