[libc-commits] [libc] [libc] Implemented CharacterConverter push/pop for utf32->utf8 conversions (PR #143971)

Uzair Nawaz via libc-commits libc-commits at lists.llvm.org
Fri Jun 13 11:09:09 PDT 2025


https://github.com/uzairnawaz updated https://github.com/llvm/llvm-project/pull/143971

>From a4c095a664e5327b3fe473f8e82fb01987472fb1 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 11 Jun 2025 23:16:06 +0000
Subject: [PATCH 01/17] build fixes

---
 libc/hdr/CMakeLists.txt                      |  2 ++
 libc/hdr/types/CMakeLists.txt                | 23 ++++++++++++++++++++
 libc/include/llvm-libc-types/char8_t.h       |  3 +--
 libc/src/__support/CMakeLists.txt            |  2 ++
 libc/src/__support/wchar/mbstate.h           |  1 +
 libc/src/__support/wchar/utf_ret.h           |  3 ++-
 libc/test/src/__support/CMakeLists.txt       |  1 +
 libc/test/src/__support/wchar/CMakeLists.txt | 11 ++++++++++
 8 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt

diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 209fcb965242f..1e40e3e4cc908 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -212,6 +212,8 @@ add_proxy_header_library(
 
 add_header_library(wchar_overlay HDRS wchar_overlay.h)
 
+add_header_library(uchar_overlay HDRS uchar_overlay.h)
+
 add_proxy_header_library(
   wchar_macros
   HDRS
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 5f6197c93d445..89eabc0bc4b2e 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -1,3 +1,26 @@
+
+add_proxy_header_library(
+  char8_t 
+  HDRS
+    char8_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char8_t
+    libc.include.uchar
+)
+
+add_proxy_header_library(
+  char32_t 
+  HDRS
+    char32_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char32_t
+    libc.include.uchar
+)
+
 add_proxy_header_library(
   div_t
   HDRS
diff --git a/libc/include/llvm-libc-types/char8_t.h b/libc/include/llvm-libc-types/char8_t.h
index ddadab1afa219..a343be77d810b 100644
--- a/libc/include/llvm-libc-types/char8_t.h
+++ b/libc/include/llvm-libc-types/char8_t.h
@@ -9,8 +9,7 @@
 #ifndef LLVM_LIBC_TYPES_CHAR8_T_H
 #define LLVM_LIBC_TYPES_CHAR8_T_H
 
-#if !defined(__cplusplus) && defined(__STDC_VERSION__) &&                      \
-    __STDC_VERSION__ >= 202311L
+#if !(defined(__cplusplus) && defined(__cpp_char8_t))
 typedef unsigned char char8_t;
 #endif
 
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index f92499fdbf451..201967fe05f0e 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -381,3 +381,5 @@ add_subdirectory(HashTable)
 add_subdirectory(fixed_point)
 
 add_subdirectory(time)
+
+add_subdirectory(wchar)
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index 72ec727560003..0a0c5e2ad4ce1 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 #define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 
+#include "src/__support/common.h"
 #include "hdr/types/char32_t.h"
 #include <stdint.h>
 
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
index b8a8f6f094143..ad9690ec1f646 100644
--- a/libc/src/__support/wchar/utf_ret.h
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -10,12 +10,13 @@
 #define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 
 namespace LIBC_NAMESPACE_DECL {
-
+namespace internal {
 template <typename T> struct utf_ret {
   T out;
   int error;
 };
 
+} // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..8905ac2127620 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
+add_subdirectory(wchar)
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..109f3ab3c85db
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf32_to_8_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf32_to_8_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+ )
\ No newline at end of file

>From e5ff004c47a60811417c6c0da6524f946a84c1f2 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 11 Jun 2025 23:26:08 +0000
Subject: [PATCH 02/17] style fix, included empty test file to avoid failures

---
 libc/src/__support/wchar/mbstate.h                |  2 +-
 libc/test/src/__support/wchar/utf32_to_8_test.cpp | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 libc/test/src/__support/wchar/utf32_to_8_test.cpp

diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index 0a0c5e2ad4ce1..cb8950374de41 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 #define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 
-#include "src/__support/common.h"
 #include "hdr/types/char32_t.h"
+#include "src/__support/common.h"
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
new file mode 100644
index 0000000000000..3638439862fbb
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -0,0 +1,15 @@
+//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {}

>From 36127243b164086270ca8779aec60c54570735a9 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 16:17:16 +0000
Subject: [PATCH 03/17] style; missing include

---
 libc/hdr/types/CMakeLists.txt                | 1 -
 libc/src/__support/wchar/utf_ret.h           | 2 ++
 libc/test/src/__support/wchar/CMakeLists.txt | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 89eabc0bc4b2e..c88c357009072 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 add_proxy_header_library(
   char8_t 
   HDRS
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
index ad9690ec1f646..fa99b76159bd8 100644
--- a/libc/src/__support/wchar/utf_ret.h
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 #define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 
+#include "src/__support/common.h"
+
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 template <typename T> struct utf_ret {
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 109f3ab3c85db..f6abdbcc54bc3 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -8,4 +8,4 @@ add_libc_test(
     utf32_to_8_test.cpp 
   DEPENDS
     libc.src.__support.wchar.character_converter
- )
\ No newline at end of file
+)

>From e7ebd785df0ccf68d4759687dcdbf41708c14012 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 16:19:50 +0000
Subject: [PATCH 04/17] removed incomplete test from build

---
 libc/test/src/__support/wchar/CMakeLists.txt | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index f6abdbcc54bc3..0ed384571f232 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,11 +1 @@
 add_custom_target(libc-support-wchar-tests)
-
-add_libc_test(
-  utf32_to_8_test 
-  SUITE
-    libc-support-tests
-  SRCS
-    utf32_to_8_test.cpp 
-  DEPENDS
-    libc.src.__support.wchar.character_converter
-)

>From fe7f23d59c50f0aa221c04fff1c8ef2758ee5070 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 17:05:00 +0000
Subject: [PATCH 05/17] implemented CharacterConverter push/pop for utf32 -> utf8;
 added tests

---
 .../__support/wchar/character_converter.cpp   | 140 +++++++++++++++++-
 .../src/__support/wchar/character_converter.h |  11 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 ++
 .../src/__support/wchar/utf32_to_8_test.cpp   | 128 +++++++++++++++-
 4 files changed, 280 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 0afc2a6f59e64..88528960b1a2f 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
 
@@ -16,17 +17,144 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; }
+CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
-bool CharacterConverter::isComplete() {}
+bool CharacterConverter::isComplete() {
+  return state->bits_processed / 8 == state->total_bytes;
+}
 
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char8_t utf8_byte) { return utf8_byte; }
 
-int CharacterConverter::push(char32_t utf32) {}
+int CharacterConverter::push(char32_t utf32) {
+  state->partial = utf32;
+  state->bits_processed = 0;
+  state->total_bytes = 0;
 
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+  // determine number of utf-8 bytes needed to represent this utf32 value
+  char32_t ranges[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+  const int num_ranges = 4;
+  for (uint8_t i = 0; i < num_ranges; i++) {
+    if (state->partial <= ranges[i]) {
+      state->total_bytes = i + 1;
+      break;
+    }
+  }
+  if (state->total_bytes == 0) {
+    return -1;
+  }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+  return 0;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 0xxxxxxx
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(state->partial);
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 110xxxxx 10xxxxxx
+  char32_t utf32 = state->partial;
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(0xC0 | (utf32 >> 6));
+    break;
+  case 8:
+    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 1110xxxx 10xxxxxx 10xxxxxx
+  char32_t utf32 = state->partial;
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(0xE0 | (utf32 >> 12));
+    break;
+  case 8:
+    result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
+    break;
+  case 16:
+    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  char32_t utf32 = state->partial;
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(0xF0 | (utf32 >> 18));
+    break;
+  case 8:
+    result.out = (char8_t)(0x80 | ((utf32 >> 12) & 0x3f));
+    break;
+  case 16:
+    result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
+    break;
+  case 24:
+    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8() {
+  switch (state->total_bytes) {
+  case 1:
+    return pop_utf8_seqlength1();
+  case 2:
+    return pop_utf8_seqlength2();
+  case 3:
+    return pop_utf8_seqlength3();
+  case 4:
+    return pop_utf8_seqlength4();
+  }
+
+  return {.out = 0, .error = -1};
+}
+
+utf_ret<char32_t> CharacterConverter::pop_utf32() { return {0, -1}; }
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index a6bac43805376..a082efadc973a 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -13,16 +13,23 @@
 #include "hdr/types/char8_t.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
+#include "src/__support/common.h"
+
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 class CharacterConverter {
 private:
-  mbstate_t *state;
+  mbstate *state;
+
+  utf_ret<char8_t> pop_utf8_seqlength1();
+  utf_ret<char8_t> pop_utf8_seqlength2();
+  utf_ret<char8_t> pop_utf8_seqlength3();
+  utf_ret<char8_t> pop_utf8_seqlength4();
 
 public:
-  CharacterConverter(mbstate_t *mbstate);
+  CharacterConverter(mbstate *mbstate);
 
   bool isComplete();
 
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 0ed384571f232..f6abdbcc54bc3 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1 +1,11 @@
 add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf32_to_8_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf32_to_8_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index 3638439862fbb..8e90ded93704c 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -12,4 +12,130 @@
 
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {}
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32_A = 0x41;
+  cr.push(utf32_A);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<char>(popped.out), 'A');
+  ASSERT_TRUE(cr.isComplete());
+
+  char32_t utf32_B = 0x42;
+  cr.push(utf32_B);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<char>(popped.out), 'B');
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32 = 0xff;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xc3);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xbf);
+  ASSERT_TRUE(cr.isComplete());
+
+  utf32 = 0x58e;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xd6);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x8e);
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32 = 0xac15;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xea);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x95);
+  ASSERT_TRUE(cr.isComplete());
+
+  utf32 = 0x267b;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x99);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32 = 0xac15;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xea);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x95);
+  ASSERT_TRUE(cr.isComplete());
+
+  utf32 = 0x267b;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x99);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}

>From 3b6fccf53b40b8b5e2375e92aae2f7e69dd6d553 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:30:27 +0000
Subject: [PATCH 06/17] switched from bits_processed to bytes_processed

---
 .../__support/wchar/character_converter.cpp   | 34 +++++++---------
 .../src/__support/wchar/utf32_to_8_test.cpp   | 39 ++++++++++++++-----
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 88528960b1a2f..2bf77d1b54178 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -19,15 +19,11 @@ namespace internal {
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
-bool CharacterConverter::isComplete() {
-  return state->bits_processed / 8 == state->total_bytes;
-}
-
 int CharacterConverter::push(char8_t utf8_byte) { return utf8_byte; }
 
 int CharacterConverter::push(char32_t utf32) {
   state->partial = utf32;
-  state->bits_processed = 0;
+  state->bytes_processed = 0;
   state->total_bytes = 0;
 
   // determine number of utf-8 bytes needed to represent this utf32 value
@@ -51,7 +47,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
   result.error = 0;
 
   // 0xxxxxxx
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(state->partial);
     break;
@@ -60,7 +56,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
@@ -70,11 +66,11 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
 
   // 110xxxxx 10xxxxxx
   char32_t utf32 = state->partial;
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(0xC0 | (utf32 >> 6));
     break;
-  case 8:
+  case 1:
     result.out = (char8_t)(0x80 | (utf32 & 0x3f));
     break;
   default:
@@ -82,7 +78,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
@@ -92,14 +88,14 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
 
   // 1110xxxx 10xxxxxx 10xxxxxx
   char32_t utf32 = state->partial;
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(0xE0 | (utf32 >> 12));
     break;
-  case 8:
+  case 1:
     result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
     break;
-  case 16:
+  case 2:
     result.out = (char8_t)(0x80 | (utf32 & 0x3f));
     break;
   default:
@@ -107,7 +103,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
@@ -117,17 +113,17 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
 
   // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   char32_t utf32 = state->partial;
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(0xF0 | (utf32 >> 18));
     break;
-  case 8:
+  case 1:
     result.out = (char8_t)(0x80 | ((utf32 >> 12) & 0x3f));
     break;
-  case 16:
+  case 2:
     result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
     break;
-  case 24:
+  case 3:
     result.out = (char8_t)(0x80 | (utf32 & 0x3f));
     break;
   default:
@@ -135,7 +131,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index 8e90ded93704c..28b4464eb1228 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -16,20 +16,22 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
-  char32_t utf32_A = 0x41;
+  // utf8 1-byte encodings are identical to their utf32 representations
+  char32_t utf32_A = 0x41; // 'A'
   cr.push(utf32_A);
   auto popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
   ASSERT_EQ(static_cast<char>(popped.out), 'A');
   ASSERT_TRUE(cr.isComplete());
 
-  char32_t utf32_B = 0x42;
+  char32_t utf32_B = 0x42; // 'B'
   cr.push(utf32_B);
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
   ASSERT_EQ(static_cast<char>(popped.out), 'B');
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }
@@ -38,6 +40,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
+  // testing utf32: 0xff -> utf8: 0xc3 0xbf
   char32_t utf32 = 0xff;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
@@ -49,6 +52,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0xbf);
   ASSERT_TRUE(cr.isComplete());
 
+  // testing utf32: 0x58e -> utf8: 0xd6 0x8e
   utf32 = 0x58e;
   cr.push(utf32);
   popped = cr.pop_utf8();
@@ -60,6 +64,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0x8e);
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }
@@ -68,6 +73,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
+  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
   char32_t utf32 = 0xac15;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
@@ -83,6 +89,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0x95);
   ASSERT_TRUE(cr.isComplete());
 
+  // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
   utf32 = 0x267b;
   cr.push(utf32);
   popped = cr.pop_utf8();
@@ -98,6 +105,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }
@@ -106,36 +114,47 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
-  char32_t utf32 = 0xac15;
+  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  char32_t utf32 = 0x1f921;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xea);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x9f);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x95);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xa4);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
   ASSERT_TRUE(cr.isComplete());
 
-  utf32 = 0x267b;
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  utf32 = 0x12121;
   cr.push(utf32);
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x99);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x92);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x84);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }

>From 842a50550b20cd60dcf74f9e0969e173ec4206ef Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:37:28 +0000
Subject: [PATCH 07/17] style

---
 libc/src/__support/wchar/character_converter.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 481f7e61b6575..fd11b6843598a 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -154,7 +154,9 @@ utf_ret<char8_t> CharacterConverter::pop_utf8() {
   return {.out = 0, .error = -1};
 }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() { return {0, -1}; }
+utf_ret<char32_t> CharacterConverter::pop_utf32() {
+  return {.out = 0, .error = -1};
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL

>From 21b2c04fd99c5c87e3a9f5b31f1f0fae223e8849 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:40:18 +0000
Subject: [PATCH 08/17] style

---
 libc/test/src/__support/wchar/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index f6abdbcc54bc3..5dff6e9115f7d 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,11 +1,11 @@
 add_custom_target(libc-support-wchar-tests)
 
 add_libc_test(
-  utf32_to_8_test 
+  utf32_to_8_test
   SUITE
     libc-support-tests
   SRCS
-    utf32_to_8_test.cpp 
+    utf32_to_8_test.cpp
   DEPENDS
     libc.src.__support.wchar.character_converter
 )

>From fb758a3b7cca5eb11d1a8108204e8ebde7a98444 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:47:22 +0000
Subject: [PATCH 09/17] removed unimplemented functions from cpp

---
 libc/src/__support/wchar/character_converter.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index fd11b6843598a..ec78282d7f035 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -23,8 +23,6 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) { return utf8_byte; }
-
 int CharacterConverter::push(char32_t utf32) {
   state->partial = utf32;
   state->bytes_processed = 0;
@@ -154,9 +152,5 @@ utf_ret<char8_t> CharacterConverter::pop_utf8() {
   return {.out = 0, .error = -1};
 }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {
-  return {.out = 0, .error = -1};
-}
-
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL

>From e1cbdc389b20b7d895e56659b6de75c7df013f64 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:51:51 +0000
Subject: [PATCH 10/17] ensure mbstate is 0-initialized in tests

---
 libc/test/src/__support/wchar/utf32_to_8_test.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index 28b4464eb1228..fb492c7a53f9a 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // utf8 1-byte encodings are identical to their utf32 representations
@@ -37,7 +37,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // testing utf32: 0xff -> utf8: 0xc3 0xbf
@@ -70,7 +70,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
@@ -111,7 +111,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1

>From ac92fff0c06db9772a7dde3a137bb9e3eec787da Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 21:40:38 +0000
Subject: [PATCH 11/17] remove wchar test from macos

---
 libc/test/src/__support/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 8905ac2127620..76218a16e0cf7 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,4 +275,9 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-add_subdirectory(wchar)
+
+# Requires access to the uchar header, which is not available on macOS.
+# Therefore, this cannot currently be built on macOS in overlay mode.
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()

>From 9698a8dd45da41791123f0a4acec184ea1a6737a Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 23:15:17 +0000
Subject: [PATCH 12/17] condensed conversion math

---
 .../__support/wchar/character_converter.cpp   | 123 +++---------------
 .../src/__support/wchar/character_converter.h |   5 -
 2 files changed, 20 insertions(+), 108 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ec78282d7f035..f18c869c5788f 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -37,119 +37,36 @@ int CharacterConverter::push(char32_t utf32) {
       break;
     }
   }
-  if (state->total_bytes == 0) {
+  if (state->total_bytes == 0)
     return -1;
-  }
 
   return 0;
 }
 
-utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
-  utf_ret<char8_t> result;
-  result.error = 0;
-
-  // 0xxxxxxx
-  switch (state->bytes_processed) {
-  case 0:
-    result.out = (char8_t)(state->partial);
-    break;
-  default:
-    result.error = -1;
-    return result;
-  }
-
-  state->bytes_processed++;
-  return result;
-}
-
-utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
-  utf_ret<char8_t> result;
-  result.error = 0;
-
-  // 110xxxxx 10xxxxxx
-  char32_t utf32 = state->partial;
-  switch (state->bytes_processed) {
-  case 0:
-    result.out = (char8_t)(0xC0 | (utf32 >> 6));
-    break;
-  case 1:
-    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
-    break;
-  default:
-    result.error = -1;
-    return result;
-  }
-
-  state->bytes_processed++;
-  return result;
-}
-
-utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
-  utf_ret<char8_t> result;
-  result.error = 0;
-
-  // 1110xxxx 10xxxxxx 10xxxxxx
-  char32_t utf32 = state->partial;
-  switch (state->bytes_processed) {
-  case 0:
-    result.out = (char8_t)(0xE0 | (utf32 >> 12));
-    break;
-  case 1:
-    result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
-    break;
-  case 2:
-    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
-    break;
-  default:
-    result.error = -1;
-    return result;
-  }
-
-  state->bytes_processed++;
-  return result;
-}
-
-utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
-  utf_ret<char8_t> result;
-  result.error = 0;
+utf_ret<char8_t> CharacterConverter::pop_utf8() {
+  if (state->bytes_processed >= state->total_bytes)
+    return {.out = 0, .error = -1};
 
-  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  char8_t first_byte_headers[] = {0, 0xC0, 0xE0, 0xF0};
   char32_t utf32 = state->partial;
-  switch (state->bytes_processed) {
-  case 0:
-    result.out = (char8_t)(0xF0 | (utf32 >> 18));
-    break;
-  case 1:
-    result.out = (char8_t)(0x80 | ((utf32 >> 12) & 0x3f));
-    break;
-  case 2:
-    result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
-    break;
-  case 3:
-    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
-    break;
-  default:
-    result.error = -1;
-    return result;
+  char32_t tb = state->total_bytes;
+  char32_t bp = state->bytes_processed;
+  char32_t output;
+  if (state->bytes_processed == 0) {
+    /*
+      Choose the correct set of most significant bits to encode the length
+      of the utf8 sequence. The remaining bits contain the most significant
+      bits of the unicode value of the character.
+    */
+    output = first_byte_headers[tb - 1] | (utf32 >> ((tb - 1) * 6));
+  } else {
+    // Get the next 6 bits and format it like so: 10xxxxxx
+    const char32_t shift_amount = (tb - bp - 1) * 6;
+    output = 0x80 | ((utf32 >> shift_amount) & 0x3f);
   }
 
   state->bytes_processed++;
-  return result;
-}
-
-utf_ret<char8_t> CharacterConverter::pop_utf8() {
-  switch (state->total_bytes) {
-  case 1:
-    return pop_utf8_seqlength1();
-  case 2:
-    return pop_utf8_seqlength2();
-  case 3:
-    return pop_utf8_seqlength3();
-  case 4:
-    return pop_utf8_seqlength4();
-  }
-
-  return {.out = 0, .error = -1};
+  return {.out = (char8_t)output, .error = 0};
 }
 
 } // namespace internal
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index f8302bb5e79b8..028a687f0c48f 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -22,11 +22,6 @@ class CharacterConverter {
 private:
   mbstate *state;
 
-  utf_ret<char8_t> pop_utf8_seqlength1();
-  utf_ret<char8_t> pop_utf8_seqlength2();
-  utf_ret<char8_t> pop_utf8_seqlength3();
-  utf_ret<char8_t> pop_utf8_seqlength4();
-
 public:
   CharacterConverter(mbstate *mbstate);
 

>From f3c4a47ed63ee7dcfd8fbe01dfc71e15e47e394d Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Fri, 13 Jun 2025 16:31:25 +0000
Subject: [PATCH 13/17] simplified conversion math; switched to error_or
 instead of utf_ret

---
 libc/src/__support/wchar/CMakeLists.txt       |  8 +-
 .../__support/wchar/character_converter.cpp   | 22 ++---
 .../src/__support/wchar/character_converter.h |  6 +-
 libc/src/__support/wchar/utf_ret.h            | 24 -----
 .../src/__support/wchar/utf32_to_8_test.cpp   | 88 +++++++++----------
 5 files changed, 60 insertions(+), 88 deletions(-)
 delete mode 100644 libc/src/__support/wchar/utf_ret.h

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 5cca58400ff45..323e47a88e86c 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -15,12 +15,6 @@ add_object_library(
   DEPENDS
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.src.__support.error_or
     .mbstate
-    .utf_ret
-)
-
-add_header_library(
-  utf_ret
-  HDRS
-    utf_ret.h
 )
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f18c869c5788f..2861bbe0e466e 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -9,8 +9,8 @@
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
 #include "src/__support/common.h"
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 #include "character_converter.h"
 
@@ -43,30 +43,32 @@ int CharacterConverter::push(char32_t utf32) {
   return 0;
 }
 
-utf_ret<char8_t> CharacterConverter::pop_utf8() {
+ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (state->bytes_processed >= state->total_bytes)
-    return {.out = 0, .error = -1};
+    return Error(-1);
+
+  const char8_t first_byte_headers[] = {0, 0xC0, 0xE0, 0xF0};
+  const char32_t utf32 = state->partial;
+  const char32_t tot_bytes = state->total_bytes;
+  const char32_t bytes_proc = state->bytes_processed;
 
-  char8_t first_byte_headers[] = {0, 0xC0, 0xE0, 0xF0};
-  char32_t utf32 = state->partial;
-  char32_t tb = state->total_bytes;
-  char32_t bp = state->bytes_processed;
   char32_t output;
+  // Shift to get the next 6 bits from the utf32 encoding
+  const char32_t shift_amount = (tot_bytes - bytes_proc - 1) * 6;
   if (state->bytes_processed == 0) {
     /*
       Choose the correct set of most significant bits to encode the length
       of the utf8 sequence. The remaining bits contain the most significant
       bits of the unicode value of the character.
     */
-    output = first_byte_headers[tb - 1] | (utf32 >> ((tb - 1) * 6));
+    output = first_byte_headers[tot_bytes - 1] | (utf32 >> shift_amount);
   } else {
     // Get the next 6 bits and format it like so: 10xxxxxx
-    const char32_t shift_amount = (tb - bp - 1) * 6;
     output = 0x80 | ((utf32 >> shift_amount) & 0x3f);
   }
 
   state->bytes_processed++;
-  return {.out = (char8_t)output, .error = 0};
+  return (char8_t)output;
 }
 
 } // namespace internal
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index 028a687f0c48f..4b798647aaa5b 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -12,8 +12,8 @@
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
 #include "src/__support/common.h"
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
@@ -30,8 +30,8 @@ class CharacterConverter {
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
 
-  utf_ret<char8_t> pop_utf8();
-  utf_ret<char32_t> pop_utf32();
+  ErrorOr<char8_t> pop_utf8();
+  ErrorOr<char32_t> pop_utf32();
 };
 
 } // namespace internal
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
deleted file mode 100644
index fa99b76159bd8..0000000000000
--- a/libc/src/__support/wchar/utf_ret.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace internal {
-template <typename T> struct utf_ret {
-  T out;
-  int error;
-};
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index fb492c7a53f9a..26915ce43ffae 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -20,20 +20,20 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
   char32_t utf32_A = 0x41; // 'A'
   cr.push(utf32_A);
   auto popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<char>(popped.out), 'A');
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'A');
   ASSERT_TRUE(cr.isComplete());
 
   char32_t utf32_B = 0x42; // 'B'
   cr.push(utf32_B);
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<char>(popped.out), 'B');
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'B');
   ASSERT_TRUE(cr.isComplete());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
-  ASSERT_NE(popped.error, 0);
+  ASSERT_FALSE(popped.has_value());
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
@@ -44,29 +44,29 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   char32_t utf32 = 0xff;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xc3);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xbf);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
   ASSERT_TRUE(cr.isComplete());
 
   // testing utf32: 0x58e -> utf8: 0xd6 0x8e
   utf32 = 0x58e;
   cr.push(utf32);
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xd6);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x8e);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
   ASSERT_TRUE(cr.isComplete());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
-  ASSERT_NE(popped.error, 0);
+  ASSERT_FALSE(popped.has_value());
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
@@ -77,37 +77,37 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   char32_t utf32 = 0xac15;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xea);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x95);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
   ASSERT_TRUE(cr.isComplete());
 
   // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
   utf32 = 0x267b;
   cr.push(utf32);
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x99);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
   ASSERT_TRUE(cr.isComplete());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
-  ASSERT_NE(popped.error, 0);
+  ASSERT_FALSE(popped.has_value());
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
@@ -118,43 +118,43 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
   char32_t utf32 = 0x1f921;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x9f);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xa4);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
   ASSERT_TRUE(cr.isComplete());
 
   // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
   utf32 = 0x12121;
   cr.push(utf32);
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x92);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x84);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
-  ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
   ASSERT_TRUE(cr.isComplete());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
-  ASSERT_NE(popped.error, 0);
+  ASSERT_FALSE(popped.has_value());
 }

>From 864381da4b54ac1cec88df292794ac8b68378f1e Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Fri, 13 Jun 2025 17:14:28 +0000
Subject: [PATCH 14/17] removed magic numbers; added comments to mbstate for
 clarity

---
 .../__support/wchar/character_converter.cpp   | 23 ++++++++++++-------
 libc/src/__support/wchar/mbstate.h            |  9 ++++++++
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 2861bbe0e466e..6f1a97c3a74d5 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -29,7 +29,7 @@ int CharacterConverter::push(char32_t utf32) {
   state->total_bytes = 0;
 
   // determine number of utf-8 bytes needed to represent this utf32 value
-  char32_t ranges[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+  const char32_t ranges[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
   const int num_ranges = 4;
   for (uint8_t i = 0; i < num_ranges; i++) {
     if (state->partial <= ranges[i]) {
@@ -47,24 +47,31 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (state->bytes_processed >= state->total_bytes)
     return Error(-1);
 
-  const char8_t first_byte_headers[] = {0, 0xC0, 0xE0, 0xF0};
-  const char32_t utf32 = state->partial;
-  const char32_t tot_bytes = state->total_bytes;
-  const char32_t bytes_proc = state->bytes_processed;
+  const char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
+  const char8_t CONTINUING_BYTE_HEADER = 0x80;
+
+  // the number of bits per utf-8 byte that actually encode character
+  // information not metadata (# of bits excluding the byte headers)
+  const int ENCODED_BITS_PER_BYTE = 6;
+  const int MASK_LOWER_SIX = 0x3f;
 
   char32_t output;
+
   // Shift to get the next 6 bits from the utf32 encoding
-  const char32_t shift_amount = (tot_bytes - bytes_proc - 1) * 6;
+  const char32_t shift_amount =
+      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_BYTE;
   if (state->bytes_processed == 0) {
     /*
       Choose the correct set of most significant bits to encode the length
       of the utf8 sequence. The remaining bits contain the most significant
       bits of the unicode value of the character.
     */
-    output = first_byte_headers[tot_bytes - 1] | (utf32 >> shift_amount);
+    output = FIRST_BYTE_HEADERS[state->total_bytes - 1] |
+             (state->partial >> shift_amount);
   } else {
     // Get the next 6 bits and format it like so: 10xxxxxx
-    output = 0x80 | ((utf32 >> shift_amount) & 0x3f);
+    output = CONTINUING_BYTE_HEADER |
+             ((state->partial >> shift_amount) & MASK_LOWER_SIX);
   }
 
   state->bytes_processed++;
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index d33ee354a5443..fb08fb4eaa188 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 struct mbstate {
+  // store a partial codepoint (in UTF-32)
   char32_t partial;
+
+  /*
+  Progress towards a conversion
+    For utf8  -> utf32, increases with each CharacterConverter::push(utf8_byte)
+    For utf32 ->  utf8, increases with each CharacterConverter::pop_utf8()
+  */
   uint8_t bytes_processed;
+
+  // Total number of bytes that will be needed to represent this character
   uint8_t total_bytes;
 };
 

>From 1a6604ee99ac5ed7d95269b4b3dfd9377ecacd80 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Fri, 13 Jun 2025 17:27:20 +0000
Subject: [PATCH 15/17] used mask_trailing_ones instead of a predefined
 constant

---
 libc/src/__support/wchar/CMakeLists.txt          | 1 +
 libc/src/__support/wchar/character_converter.cpp | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 323e47a88e86c..6715e354e23e5 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -16,5 +16,6 @@ add_object_library(
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
     libc.src.__support.error_or
+    libc.src.__support.math_extras
     .mbstate
 )
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 6f1a97c3a74d5..49ed238daebbf 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -10,6 +10,7 @@
 #include "hdr/types/char8_t.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
+#include "src/__support/math_extras.h"
 #include "src/__support/wchar/mbstate.h"
 
 #include "character_converter.h"
@@ -53,7 +54,8 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   // the number of bits per utf-8 byte that actually encode character
   // information not metadata (# of bits excluding the byte headers)
   const int ENCODED_BITS_PER_BYTE = 6;
-  const int MASK_LOWER_SIX = 0x3f;
+  const int MASK_LOWER_SIX =
+      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_BYTE>();
 
   char32_t output;
 

>From 61042324e62508f07032678a836bc05b484bf249 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Fri, 13 Jun 2025 18:08:14 +0000
Subject: [PATCH 16/17] Update libc/src/__support/wchar/character_converter.cpp

Co-authored-by: Michael Jones <michaelrj at google.com>
---
 libc/src/__support/wchar/character_converter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 49ed238daebbf..05ae25288357c 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -54,7 +54,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   // the number of bits per utf-8 byte that actually encode character
   // information not metadata (# of bits excluding the byte headers)
   const int ENCODED_BITS_PER_BYTE = 6;
-  const int MASK_LOWER_SIX =
+  const int MASK_ENCODED_BITS =
       mask_trailing_ones<unsigned int, ENCODED_BITS_PER_BYTE>();
 
   char32_t output;

>From 50ec604ea911543bb3bb1b293384b00b1d47e498 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Fri, 13 Jun 2025 18:08:53 +0000
Subject: [PATCH 17/17] Update libc/src/__support/wchar/character_converter.cpp

Co-authored-by: Michael Jones <michaelrj at google.com>
---
 libc/src/__support/wchar/character_converter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 05ae25288357c..736168d74b140 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -53,7 +53,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
 
   // the number of bits per utf-8 byte that actually encode character
   // information not metadata (# of bits excluding the byte headers)
-  const int ENCODED_BITS_PER_BYTE = 6;
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
   const int MASK_ENCODED_BITS =
       mask_trailing_ones<unsigned int, ENCODED_BITS_PER_BYTE>();
 



More information about the libc-commits mailing list