[libc-commits] [libc] [libc] Implemented CharacterConverter push/pop for utf32->utf8 conversions (PR #143971)

Uzair Nawaz via libc-commits libc-commits at lists.llvm.org
Thu Jun 12 14:41:12 PDT 2025


https://github.com/uzairnawaz updated https://github.com/llvm/llvm-project/pull/143971

>From a4c095a664e5327b3fe473f8e82fb01987472fb1 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 11 Jun 2025 23:16:06 +0000
Subject: [PATCH 01/11] build fixes

---
 libc/hdr/CMakeLists.txt                      |  2 ++
 libc/hdr/types/CMakeLists.txt                | 23 ++++++++++++++++++++
 libc/include/llvm-libc-types/char8_t.h       |  3 +--
 libc/src/__support/CMakeLists.txt            |  2 ++
 libc/src/__support/wchar/mbstate.h           |  1 +
 libc/src/__support/wchar/utf_ret.h           |  3 ++-
 libc/test/src/__support/CMakeLists.txt       |  1 +
 libc/test/src/__support/wchar/CMakeLists.txt | 11 ++++++++++
 8 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt

diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 209fcb965242f..1e40e3e4cc908 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -212,6 +212,8 @@ add_proxy_header_library(
 
 add_header_library(wchar_overlay HDRS wchar_overlay.h)
 
+add_header_library(uchar_overlay HDRS uchar_overlay.h)
+
 add_proxy_header_library(
   wchar_macros
   HDRS
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 5f6197c93d445..89eabc0bc4b2e 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -1,3 +1,26 @@
+
+add_proxy_header_library(
+  char8_t 
+  HDRS
+    char8_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char8_t
+    libc.include.uchar
+)
+
+add_proxy_header_library(
+  char32_t 
+  HDRS
+    char32_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char32_t
+    libc.include.uchar
+)
+
 add_proxy_header_library(
   div_t
   HDRS
diff --git a/libc/include/llvm-libc-types/char8_t.h b/libc/include/llvm-libc-types/char8_t.h
index ddadab1afa219..a343be77d810b 100644
--- a/libc/include/llvm-libc-types/char8_t.h
+++ b/libc/include/llvm-libc-types/char8_t.h
@@ -9,8 +9,7 @@
 #ifndef LLVM_LIBC_TYPES_CHAR8_T_H
 #define LLVM_LIBC_TYPES_CHAR8_T_H
 
-#if !defined(__cplusplus) && defined(__STDC_VERSION__) &&                      \
-    __STDC_VERSION__ >= 202311L
+#if !(defined(__cplusplus) && defined(__cpp_char8_t))
 typedef unsigned char char8_t;
 #endif
 
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index f92499fdbf451..201967fe05f0e 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -381,3 +381,5 @@ add_subdirectory(HashTable)
 add_subdirectory(fixed_point)
 
 add_subdirectory(time)
+
+add_subdirectory(wchar)
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index 72ec727560003..0a0c5e2ad4ce1 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 #define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 
+#include "src/__support/common.h"
 #include "hdr/types/char32_t.h"
 #include <stdint.h>
 
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
index b8a8f6f094143..ad9690ec1f646 100644
--- a/libc/src/__support/wchar/utf_ret.h
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -10,12 +10,13 @@
 #define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 
 namespace LIBC_NAMESPACE_DECL {
-
+namespace internal {
 template <typename T> struct utf_ret {
   T out;
   int error;
 };
 
+} // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..8905ac2127620 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
+add_subdirectory(wchar)
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..109f3ab3c85db
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf32_to_8_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf32_to_8_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+ )
\ No newline at end of file

>From e5ff004c47a60811417c6c0da6524f946a84c1f2 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 11 Jun 2025 23:26:08 +0000
Subject: [PATCH 02/11] style fix, included empty test file to avoid failures

---
 libc/src/__support/wchar/mbstate.h                |  2 +-
 libc/test/src/__support/wchar/utf32_to_8_test.cpp | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 libc/test/src/__support/wchar/utf32_to_8_test.cpp

diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index 0a0c5e2ad4ce1..cb8950374de41 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 #define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 
-#include "src/__support/common.h"
 #include "hdr/types/char32_t.h"
+#include "src/__support/common.h"
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
new file mode 100644
index 0000000000000..3638439862fbb
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -0,0 +1,15 @@
+//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {}

>From 36127243b164086270ca8779aec60c54570735a9 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 16:17:16 +0000
Subject: [PATCH 03/11] style; missing include

---
 libc/hdr/types/CMakeLists.txt                | 1 -
 libc/src/__support/wchar/utf_ret.h           | 2 ++
 libc/test/src/__support/wchar/CMakeLists.txt | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 89eabc0bc4b2e..c88c357009072 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 add_proxy_header_library(
   char8_t 
   HDRS
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
index ad9690ec1f646..fa99b76159bd8 100644
--- a/libc/src/__support/wchar/utf_ret.h
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 #define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 
+#include "src/__support/common.h"
+
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 template <typename T> struct utf_ret {
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 109f3ab3c85db..f6abdbcc54bc3 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -8,4 +8,4 @@ add_libc_test(
     utf32_to_8_test.cpp 
   DEPENDS
     libc.src.__support.wchar.character_converter
- )
\ No newline at end of file
+)

>From e7ebd785df0ccf68d4759687dcdbf41708c14012 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 16:19:50 +0000
Subject: [PATCH 04/11] removed incomplete test from build

---
 libc/test/src/__support/wchar/CMakeLists.txt | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index f6abdbcc54bc3..0ed384571f232 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,11 +1 @@
 add_custom_target(libc-support-wchar-tests)
-
-add_libc_test(
-  utf32_to_8_test 
-  SUITE
-    libc-support-tests
-  SRCS
-    utf32_to_8_test.cpp 
-  DEPENDS
-    libc.src.__support.wchar.character_converter
-)

>From fe7f23d59c50f0aa221c04fff1c8ef2758ee5070 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 17:05:00 +0000
Subject: [PATCH 05/11] implemented characterconverter push/pop for utf32 -> 8;
 added tests

---
 .../__support/wchar/character_converter.cpp   | 140 +++++++++++++++++-
 .../src/__support/wchar/character_converter.h |  11 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 ++
 .../src/__support/wchar/utf32_to_8_test.cpp   | 128 +++++++++++++++-
 4 files changed, 280 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 0afc2a6f59e64..88528960b1a2f 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
 
@@ -16,17 +17,144 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; }
+CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
-bool CharacterConverter::isComplete() {}
+bool CharacterConverter::isComplete() {
+  return state->bits_processed / 8 == state->total_bytes;
+}
 
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char8_t utf8_byte) { return utf8_byte; }
 
-int CharacterConverter::push(char32_t utf32) {}
+int CharacterConverter::push(char32_t utf32) {
+  state->partial = utf32;
+  state->bits_processed = 0;
+  state->total_bytes = 0;
 
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+  // determine number of utf-8 bytes needed to represent this utf32 value
+  char32_t ranges[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+  const int num_ranges = 4;
+  for (uint8_t i = 0; i < num_ranges; i++) {
+    if (state->partial <= ranges[i]) {
+      state->total_bytes = i + 1;
+      break;
+    }
+  }
+  if (state->total_bytes == 0) {
+    return -1;
+  }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+  return 0;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 0xxxxxxx
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(state->partial);
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 110xxxxx 10xxxxxx
+  char32_t utf32 = state->partial;
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(0xC0 | (utf32 >> 6));
+    break;
+  case 8:
+    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 1110xxxx 10xxxxxx 10xxxxxx
+  char32_t utf32 = state->partial;
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(0xE0 | (utf32 >> 12));
+    break;
+  case 8:
+    result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
+    break;
+  case 16:
+    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
+  utf_ret<char8_t> result;
+  result.error = 0;
+
+  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  char32_t utf32 = state->partial;
+  switch (state->bits_processed) {
+  case 0:
+    result.out = (char8_t)(0xF0 | (utf32 >> 18));
+    break;
+  case 8:
+    result.out = (char8_t)(0x80 | ((utf32 >> 12) & 0x3f));
+    break;
+  case 16:
+    result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
+    break;
+  case 24:
+    result.out = (char8_t)(0x80 | (utf32 & 0x3f));
+    break;
+  default:
+    result.error = -1;
+    return result;
+  }
+
+  state->bits_processed += 8;
+  return result;
+}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8() {
+  switch (state->total_bytes) {
+  case 1:
+    return pop_utf8_seqlength1();
+  case 2:
+    return pop_utf8_seqlength2();
+  case 3:
+    return pop_utf8_seqlength3();
+  case 4:
+    return pop_utf8_seqlength4();
+  }
+
+  return {.out = 0, .error = -1};
+}
+
+utf_ret<char32_t> CharacterConverter::pop_utf32() { return {0, -1}; }
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index a6bac43805376..a082efadc973a 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -13,16 +13,23 @@
 #include "hdr/types/char8_t.h"
 #include "src/__support/wchar/mbstate.h"
 #include "src/__support/wchar/utf_ret.h"
+#include "src/__support/common.h"
+
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 class CharacterConverter {
 private:
-  mbstate_t *state;
+  mbstate *state;
+
+  utf_ret<char8_t> pop_utf8_seqlength1();
+  utf_ret<char8_t> pop_utf8_seqlength2();
+  utf_ret<char8_t> pop_utf8_seqlength3();
+  utf_ret<char8_t> pop_utf8_seqlength4();
 
 public:
-  CharacterConverter(mbstate_t *mbstate);
+  CharacterConverter(mbstate *mbstate);
 
   bool isComplete();
 
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 0ed384571f232..f6abdbcc54bc3 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1 +1,11 @@
 add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf32_to_8_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf32_to_8_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index 3638439862fbb..8e90ded93704c 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -12,4 +12,130 @@
 
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {}
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32_A = 0x41;
+  cr.push(utf32_A);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<char>(popped.out), 'A');
+  ASSERT_TRUE(cr.isComplete());
+
+  char32_t utf32_B = 0x42;
+  cr.push(utf32_B);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<char>(popped.out), 'B');
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32 = 0xff;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xc3);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xbf);
+  ASSERT_TRUE(cr.isComplete());
+
+  utf32 = 0x58e;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xd6);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x8e);
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32 = 0xac15;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xea);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x95);
+  ASSERT_TRUE(cr.isComplete());
+
+  utf32 = 0x267b;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x99);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+
+  char32_t utf32 = 0xac15;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xea);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x95);
+  ASSERT_TRUE(cr.isComplete());
+
+  utf32 = 0x267b;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x99);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
+  ASSERT_TRUE(cr.isComplete());
+
+  popped = cr.pop_utf8();
+  ASSERT_NE(popped.error, 0);
+}

>From 3b6fccf53b40b8b5e2375e92aae2f7e69dd6d553 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:30:27 +0000
Subject: [PATCH 06/11] switched to bytes_processed

---
 .../__support/wchar/character_converter.cpp   | 34 +++++++---------
 .../src/__support/wchar/utf32_to_8_test.cpp   | 39 ++++++++++++++-----
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 88528960b1a2f..2bf77d1b54178 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -19,15 +19,11 @@ namespace internal {
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
-bool CharacterConverter::isComplete() {
-  return state->bits_processed / 8 == state->total_bytes;
-}
-
 int CharacterConverter::push(char8_t utf8_byte) { return utf8_byte; }
 
 int CharacterConverter::push(char32_t utf32) {
   state->partial = utf32;
-  state->bits_processed = 0;
+  state->bytes_processed = 0;
   state->total_bytes = 0;
 
   // determine number of utf-8 bytes needed to represent this utf32 value
@@ -51,7 +47,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
   result.error = 0;
 
   // 0xxxxxxx
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(state->partial);
     break;
@@ -60,7 +56,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength1() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
@@ -70,11 +66,11 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
 
   // 110xxxxx 10xxxxxx
   char32_t utf32 = state->partial;
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(0xC0 | (utf32 >> 6));
     break;
-  case 8:
+  case 1:
     result.out = (char8_t)(0x80 | (utf32 & 0x3f));
     break;
   default:
@@ -82,7 +78,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength2() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
@@ -92,14 +88,14 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
 
   // 1110xxxx 10xxxxxx 10xxxxxx
   char32_t utf32 = state->partial;
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(0xE0 | (utf32 >> 12));
     break;
-  case 8:
+  case 1:
     result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
     break;
-  case 16:
+  case 2:
     result.out = (char8_t)(0x80 | (utf32 & 0x3f));
     break;
   default:
@@ -107,7 +103,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength3() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
@@ -117,17 +113,17 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
 
   // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   char32_t utf32 = state->partial;
-  switch (state->bits_processed) {
+  switch (state->bytes_processed) {
   case 0:
     result.out = (char8_t)(0xF0 | (utf32 >> 18));
     break;
-  case 8:
+  case 1:
     result.out = (char8_t)(0x80 | ((utf32 >> 12) & 0x3f));
     break;
-  case 16:
+  case 2:
     result.out = (char8_t)(0x80 | ((utf32 >> 6) & 0x3f));
     break;
-  case 24:
+  case 3:
     result.out = (char8_t)(0x80 | (utf32 & 0x3f));
     break;
   default:
@@ -135,7 +131,7 @@ utf_ret<char8_t> CharacterConverter::pop_utf8_seqlength4() {
     return result;
   }
 
-  state->bits_processed += 8;
+  state->bytes_processed++;
   return result;
 }
 
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index 8e90ded93704c..28b4464eb1228 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -16,20 +16,22 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
-  char32_t utf32_A = 0x41;
+  // utf8 1-byte encodings are identical to their utf32 representations
+  char32_t utf32_A = 0x41; // 'A'
   cr.push(utf32_A);
   auto popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
   ASSERT_EQ(static_cast<char>(popped.out), 'A');
   ASSERT_TRUE(cr.isComplete());
 
-  char32_t utf32_B = 0x42;
+  char32_t utf32_B = 0x42; // 'B'
   cr.push(utf32_B);
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
   ASSERT_EQ(static_cast<char>(popped.out), 'B');
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }
@@ -38,6 +40,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
+  // testing utf32: 0xff -> utf8: 0xc3 0xbf
   char32_t utf32 = 0xff;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
@@ -49,6 +52,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0xbf);
   ASSERT_TRUE(cr.isComplete());
 
+  // testing utf32: 0x58e -> utf8: 0xd6 0x8e
   utf32 = 0x58e;
   cr.push(utf32);
   popped = cr.pop_utf8();
@@ -60,6 +64,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0x8e);
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }
@@ -68,6 +73,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
+  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
   char32_t utf32 = 0xac15;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
@@ -83,6 +89,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0x95);
   ASSERT_TRUE(cr.isComplete());
 
+  // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
   utf32 = 0x267b;
   cr.push(utf32);
   popped = cr.pop_utf8();
@@ -98,6 +105,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }
@@ -106,36 +114,47 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
-  char32_t utf32 = 0xac15;
+  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  char32_t utf32 = 0x1f921;
   cr.push(utf32);
   auto popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xea);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xb0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x9f);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x95);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xa4);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
   ASSERT_TRUE(cr.isComplete());
 
-  utf32 = 0x267b;
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  utf32 = 0x12121;
   cr.push(utf32);
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xe2);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xf0);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0x99);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x92);
   ASSERT_TRUE(!cr.isComplete());
   popped = cr.pop_utf8();
   ASSERT_EQ(popped.error, 0);
-  ASSERT_EQ(static_cast<int>(popped.out), 0xbb);
+  ASSERT_EQ(static_cast<int>(popped.out), 0x84);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_EQ(popped.error, 0);
+  ASSERT_EQ(static_cast<int>(popped.out), 0xa1);
   ASSERT_TRUE(cr.isComplete());
 
+  // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
   ASSERT_NE(popped.error, 0);
 }

>From 842a50550b20cd60dcf74f9e0969e173ec4206ef Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:37:28 +0000
Subject: [PATCH 07/11] style

---
 libc/src/__support/wchar/character_converter.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 481f7e61b6575..fd11b6843598a 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -154,7 +154,9 @@ utf_ret<char8_t> CharacterConverter::pop_utf8() {
   return {.out = 0, .error = -1};
 }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() { return {0, -1}; }
+utf_ret<char32_t> CharacterConverter::pop_utf32() {
+  return {.out = 0, .error = -1};
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL

>From 21b2c04fd99c5c87e3a9f5b31f1f0fae223e8849 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:40:18 +0000
Subject: [PATCH 08/11] style

---
 libc/test/src/__support/wchar/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index f6abdbcc54bc3..5dff6e9115f7d 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,11 +1,11 @@
 add_custom_target(libc-support-wchar-tests)
 
 add_libc_test(
-  utf32_to_8_test 
+  utf32_to_8_test
   SUITE
     libc-support-tests
   SRCS
-    utf32_to_8_test.cpp 
+    utf32_to_8_test.cpp
   DEPENDS
     libc.src.__support.wchar.character_converter
 )

>From fb758a3b7cca5eb11d1a8108204e8ebde7a98444 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:47:22 +0000
Subject: [PATCH 09/11] removed unimplemented functions from cpp

---
 libc/src/__support/wchar/character_converter.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index fd11b6843598a..ec78282d7f035 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -23,8 +23,6 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) { return utf8_byte; }
-
 int CharacterConverter::push(char32_t utf32) {
   state->partial = utf32;
   state->bytes_processed = 0;
@@ -154,9 +152,5 @@ utf_ret<char8_t> CharacterConverter::pop_utf8() {
   return {.out = 0, .error = -1};
 }
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {
-  return {.out = 0, .error = -1};
-}
-
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL

>From e1cbdc389b20b7d895e56659b6de75c7df013f64 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 20:51:51 +0000
Subject: [PATCH 10/11] ensure mbstate is 0-initialized in tests

---
 libc/test/src/__support/wchar/utf32_to_8_test.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index 28b4464eb1228..fb492c7a53f9a 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -13,7 +13,7 @@
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // utf8 1-byte encodings are identical to their utf32 representations
@@ -37,7 +37,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // testing utf32: 0xff -> utf8: 0xc3 0xbf
@@ -70,7 +70,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
@@ -111,7 +111,7 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
 }
 
 TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::mbstate state{0, 0, 0};
   LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
 
   // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1

>From ac92fff0c06db9772a7dde3a137bb9e3eec787da Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 12 Jun 2025 21:40:38 +0000
Subject: [PATCH 11/11] remove wchar test from macos

---
 libc/test/src/__support/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 8905ac2127620..76218a16e0cf7 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,4 +275,9 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-add_subdirectory(wchar)
+
+# Requires access to the uchar header, which is not available on macOS.
+# Therefore, this cannot currently be built on macOS in overlay mode.
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()



More information about the libc-commits mailing list