[libc-commits] [libc] [libc] Wchar Stringconverter (PR #146388)

Uzair Nawaz via libc-commits libc-commits at lists.llvm.org
Mon Jul 14 09:57:59 PDT 2025


https://github.com/uzairnawaz updated https://github.com/llvm/llvm-project/pull/146388

>From c1e1650a822d6b8867cf05304937aa4a2c046fd3 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 30 Jun 2025 16:24:13 +0000
Subject: [PATCH 01/12] added tests

---
 libc/src/__support/wchar/CMakeLists.txt       |  12 ++
 libc/src/__support/wchar/string_converter.h   |  85 +++++++++
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 ++
 .../__support/wchar/string_converter_test.cpp | 167 ++++++++++++++++++
 4 files changed, 274 insertions(+)
 create mode 100644 libc/src/__support/wchar/string_converter.h
 create mode 100644 libc/test/src/__support/wchar/string_converter_test.cpp

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 86a47319f278a..a62fd8fbb64cd 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -6,6 +6,18 @@ add_header_library(
     libc.hdr.types.char32_t    
 )
 
+add_header_library(
+  string_converter
+  HDRS
+    string_converter.h
+  DEPENDS
+    libc.hdr.types.char8_t
+    libc.hdr.types.char32_t
+    libc.src.__support.error_or
+    .mbstate
+    .character_converter 
+)
+
 add_object_library(
   character_converter
   HDRS
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
new file mode 100644
index 0000000000000..cfbf116fd112e
--- /dev/null
+++ b/libc/src/__support/wchar/string_converter.h
@@ -0,0 +1,85 @@
+//===-- Definition of a class for string conversion ------------*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+template <typename T> class StringConverter {
+private:
+  CharacterConverter cr;
+  const T *src;
+  size_t src_len;
+  size_t src_idx;
+
+  int pushFullCharacter() {
+    if (!cr.isEmpty())
+      return 0;
+
+    int original_idx = src_idx;
+    while (!cr.isFull() && src_idx < src_len) {
+      int err = cr.push(src[src_idx++]);
+      if (err != 0) {
+        // point to the beginning of the invalid sequence
+        src_idx = original_idx;
+        return err;
+      }
+    }
+
+    if (src_idx == src_len && !cr.isFull()) {
+      // src points to the beginning of the character
+      src_idx = original_idx;
+      return -1;
+    }
+
+    return 0;
+  }
+
+public:
+  StringConverter(const T *s, mbstate *ps)
+      : cr(ps), src(s), src_len(SIZE_MAX), src_idx(0) {}
+  StringConverter(const T *s, size_t len, mbstate *ps)
+      : cr(ps), src(s), src_len(len), src_idx(0) {}
+
+  ErrorOr<char32_t> popUTF32() {
+    if (pushFullCharacter() != 0)
+      return Error(0);
+
+    auto out = cr.pop_utf32();
+    if (out.has_value() && out.value() == L'\0')
+      src_len = src_idx;
+    
+    return out;
+  }
+
+  ErrorOr<char8_t> popUTF8() {
+    if (pushFullCharacter() != 0)
+      return Error(0);
+
+    auto out = cr.pop_utf8();
+    if (out.has_value() && out.value() == '\0')
+      src_len = src_idx;
+    
+    return out;
+  }
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024b..e8790df9a73cf 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,13 @@ add_libc_test(
   DEPENDS
     libc.src.__support.wchar.character_converter
 )
+
+add_libc_test(
+  string_converter_test.cpp
+  SUITE
+    libc-support-tests
+  SRCS
+    string_converter_test.cpp
+  DEPENDS
+    libc.src.__support.wchar.string_converter
+)
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
new file mode 100644
index 0000000000000..2532b54f9ab89
--- /dev/null
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -0,0 +1,167 @@
+//===-- Unittests for StringConverter class -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/errno_macros.h"
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/string_converter.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStringConverterTest, UTF8To32) {
+  // first 4 bytes are clown emoji, then next 3 are sigma symbol
+  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), &state);
+
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x2211);
+
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0);
+
+  res = sc.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(res.error(), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8) {
+  const wchar_t *src = L"\x1f921\x2211";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), &state);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x88);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x91);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0);
+
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(res.error(), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
+  const wchar_t *src = L"\x1f921\x2211";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), 1, &state);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(res.error(), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
+  // first 4 bytes are clown emoji, then next 3 are sigma symbol
+  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), 5, &state);
+ 
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+
+  res = sc.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
+  const wchar_t *src = L"\x1f921\xffffff";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), &state);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
+  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), &state);
+
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+
+  res = sc.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+}

>From f18221f23593f2e6a17e6a22a42c65f844e2f69f Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 30 Jun 2025 16:45:57 +0000
Subject: [PATCH 02/12] fixed test

---
 libc/src/__support/wchar/string_converter.h            | 10 ++++++----
 .../test/src/__support/wchar/string_converter_test.cpp | 10 +++++-----
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index cfbf116fd112e..769691e5e2a87 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -57,8 +57,9 @@ template <typename T> class StringConverter {
       : cr(ps), src(s), src_len(len), src_idx(0) {}
 
   ErrorOr<char32_t> popUTF32() {
-    if (pushFullCharacter() != 0)
-      return Error(0);
+    int err = pushFullCharacter();
+    if (err != 0)
+      return Error(err);
 
     auto out = cr.pop_utf32();
     if (out.has_value() && out.value() == L'\0')
@@ -68,8 +69,9 @@ template <typename T> class StringConverter {
   }
 
   ErrorOr<char8_t> popUTF8() {
-    if (pushFullCharacter() != 0)
-      return Error(0);
+    int err = pushFullCharacter();
+    if (err != 0)
+      return Error(err);
 
     auto out = cr.pop_utf8();
     if (out.has_value() && out.value() == '\0')
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 2532b54f9ab89..54e2d86585a71 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -35,7 +35,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
 
   res = sc.popUTF32();
   ASSERT_FALSE(res.has_value());
-  ASSERT_EQ(res.error(), 0);
+  ASSERT_EQ(res.error(), -1);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8) {
@@ -78,7 +78,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
 
   res = sc.popUTF8();
   ASSERT_FALSE(res.has_value());
-  ASSERT_EQ(res.error(), 0);
+  ASSERT_EQ(res.error(), -1);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
@@ -105,7 +105,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
 
   res = sc.popUTF8();
   ASSERT_FALSE(res.has_value());
-  ASSERT_EQ(res.error(), 0);
+  ASSERT_EQ(res.error(), -1);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
@@ -121,7 +121,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
 
   res = sc.popUTF32();
   ASSERT_FALSE(res.has_value());
-  ASSERT_EQ(static_cast<int>(res.error()), 0);
+  ASSERT_EQ(static_cast<int>(res.error()), -1);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
@@ -152,7 +152,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
-  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
       reinterpret_cast<const char8_t *>(src), &state);

>From 41e7e31d195535f1566bf0092030e86897ef5448 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 30 Jun 2025 20:13:26 +0000
Subject: [PATCH 03/12] fixed dependencies; added comments

---
 libc/src/__support/wchar/CMakeLists.txt              |  1 +
 libc/src/__support/wchar/string_converter.h          |  5 +++--
 libc/test/src/__support/wchar/CMakeLists.txt         |  4 ++++
 .../src/__support/wchar/string_converter_test.cpp    | 12 +++++++-----
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 836fecde8d6df..0828e4057c172 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -13,6 +13,7 @@ add_header_library(
   DEPENDS
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.hdr.types.size_t
     libc.src.__support.error_or
     .mbstate
     .character_converter 
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 769691e5e2a87..100828cd45fab 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -41,6 +41,7 @@ template <typename T> class StringConverter {
       }
     }
 
+    // if we aren't able to read a full character from the source string
     if (src_idx == src_len && !cr.isFull()) {
       // src points to the beginning of the character
       src_idx = original_idx;
@@ -64,7 +65,7 @@ template <typename T> class StringConverter {
     auto out = cr.pop_utf32();
     if (out.has_value() && out.value() == L'\0')
       src_len = src_idx;
-    
+
     return out;
   }
 
@@ -76,7 +77,7 @@ template <typename T> class StringConverter {
     auto out = cr.pop_utf8();
     if (out.has_value() && out.value() == '\0')
       src_len = src_idx;
-    
+
     return out;
   }
 };
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index e8790df9a73cf..6332d6469e71e 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -28,4 +28,8 @@ add_libc_test(
     string_converter_test.cpp
   DEPENDS
     libc.src.__support.wchar.string_converter
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.wchar.error_or
+    libc.hdr.errno_macros
+    libc.hdr.types.char32_t
 )
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 54e2d86585a71..268cd6f17b590 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -15,7 +15,7 @@
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcStringConverterTest, UTF8To32) {
-  // first 4 bytes are clown emoji, then next 3 are sigma symbol
+  // first 4 bytes are clown emoji (🤡), then next 3 are sigma symbol (∑)
   const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
@@ -39,7 +39,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8) {
-  const wchar_t *src = L"\x1f921\x2211";
+  const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state);
@@ -82,7 +82,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
-  const wchar_t *src = L"\x1f921\x2211";
+  const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), 1, &state);
@@ -114,7 +114,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
       reinterpret_cast<const char8_t *>(src), 5, &state);
- 
+
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
@@ -125,7 +125,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
-  const wchar_t *src = L"\x1f921\xffffff";
+  const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state);
@@ -152,6 +152,8 @@ TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
+  // first 4 bytes are clown emoji (🤡)
+  // next 2 don't form a complete character
   const char *src = "\xF0\x9F\xA4\xA1\xE2\x88";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(

>From e6453a23ac6ce1173e39319f4775af52e86c3109 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 2 Jul 2025 17:20:55 +0000
Subject: [PATCH 04/12] StringConverter class complete!

---
 .../__support/wchar/character_converter.cpp   |  11 ++
 .../src/__support/wchar/character_converter.h |   4 +
 libc/src/__support/wchar/string_converter.h   |  71 +++++---
 libc/test/src/__support/wchar/CMakeLists.txt  |   2 +-
 .../__support/wchar/string_converter_test.cpp | 155 +++++++++++++++++-
 5 files changed, 211 insertions(+), 32 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3cacfa5689e4d..8d7c4183c5998 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -14,6 +14,7 @@
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
 #include "src/__support/wchar/mbstate.h"
+#include <stddef.h>
 
 #include "character_converter.h"
 
@@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
     state->bytes_stored++;
     return 0;
   }
+
   // Invalid byte -> reset the state
   clear();
   return EILSEQ;
@@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
   return utf32;
 }
 
+size_t CharacterConverter::sizeAsUTF32() {
+  return 1; // a single utf-32 value can fit an entire character
+}
+
+size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }
+
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isEmpty())
     return Error(-1);
@@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   }
 
   state->bytes_stored--;
+  if (state->bytes_stored == 0)
+    clear();
+
   return static_cast<char8_t>(output);
 }
 
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d9a63fdc0522c..9e8dd71028002 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -14,6 +14,7 @@
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
+#include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
@@ -30,6 +31,9 @@ class CharacterConverter {
   bool isEmpty();
   bool isValidState();
 
+  size_t sizeAsUTF32();
+  size_t sizeAsUTF8();
+
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
 
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 100828cd45fab..d7c735a8c7b17 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -26,25 +26,20 @@ template <typename T> class StringConverter {
   const T *src;
   size_t src_len;
   size_t src_idx;
+  size_t bytes_pushed;
+  size_t num_to_write;
 
   int pushFullCharacter() {
-    if (!cr.isEmpty())
-      return 0;
-
-    int original_idx = src_idx;
-    while (!cr.isFull() && src_idx < src_len) {
-      int err = cr.push(src[src_idx++]);
-      if (err != 0) {
-        // point to the beginning of the invalid sequence
-        src_idx = original_idx;
+    for (bytes_pushed = 0; !cr.isFull() && src_idx + bytes_pushed < src_len;
+         ++bytes_pushed) {
+      int err = cr.push(src[src_idx + bytes_pushed]);
+      if (err != 0)
         return err;
-      }
     }
 
     // if we aren't able to read a full character from the source string
-    if (src_idx == src_len && !cr.isFull()) {
-      // src points to the beginning of the character
-      src_idx = original_idx;
+    if (src_idx + bytes_pushed == src_len && !cr.isFull()) {
+      src_idx += bytes_pushed;
       return -1;
     }
 
@@ -52,34 +47,64 @@ template <typename T> class StringConverter {
   }
 
 public:
-  StringConverter(const T *s, mbstate *ps)
-      : cr(ps), src(s), src_len(SIZE_MAX), src_idx(0) {}
-  StringConverter(const T *s, size_t len, mbstate *ps)
-      : cr(ps), src(s), src_len(len), src_idx(0) {}
+  StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate *ps)
+      : cr(ps), src(s), src_len(srclen), src_idx(0), bytes_pushed(0),
+        num_to_write(dstlen) {
+    pushFullCharacter();
+  }
+
+  StringConverter(const T *s, size_t dstlen, mbstate *ps)
+      : StringConverter(s, SIZE_MAX, dstlen, ps) {}
 
   ErrorOr<char32_t> popUTF32() {
-    int err = pushFullCharacter();
-    if (err != 0)
-      return Error(err);
+    if (cr.isEmpty()) {
+      int err = pushFullCharacter();
+      if (err != 0)
+        return Error(err);
+
+      if (cr.sizeAsUTF32() > num_to_write) {
+        cr.clear();
+        return Error(-1);
+      }
+    }
 
     auto out = cr.pop_utf32();
+    if (cr.isEmpty())
+      src_idx += bytes_pushed;
+
     if (out.has_value() && out.value() == L'\0')
       src_len = src_idx;
 
+    num_to_write--;
+
     return out;
   }
 
   ErrorOr<char8_t> popUTF8() {
-    int err = pushFullCharacter();
-    if (err != 0)
-      return Error(err);
+    if (cr.isEmpty()) {
+      int err = pushFullCharacter();
+      if (err != 0)
+        return Error(err);
+
+      if (cr.sizeAsUTF8() > num_to_write) {
+        cr.clear();
+        return Error(-1);
+      }
+    }
 
     auto out = cr.pop_utf8();
+    if (cr.isEmpty())
+      src_idx += bytes_pushed;
+
     if (out.has_value() && out.value() == '\0')
       src_len = src_idx;
 
+    num_to_write--;
+
     return out;
   }
+
+  size_t getSourceIndex() { return src_idx; }
 };
 
 } // namespace internal
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 6332d6469e71e..6982232d67544 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -29,7 +29,7 @@ add_libc_test(
   DEPENDS
     libc.src.__support.wchar.string_converter
     libc.src.__support.wchar.mbstate
-    libc.src.__support.wchar.error_or
+    libc.src.__support.error_or
     libc.hdr.errno_macros
     libc.hdr.types.char32_t
 )
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 268cd6f17b590..cb908e2fec2a8 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -19,89 +19,106 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
   const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), &state);
+      reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
 
   res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x2211);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 7);
 
   res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 8);
 
   res = sc.popUTF32();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(res.error(), -1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 8);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8) {
   const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), &state);
+      reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x88);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x91);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
 
   res = sc.popUTF8();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(res.error(), -1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
   const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), 1, &state);
+      reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_FALSE(res.has_value());
@@ -113,57 +130,179 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
   const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), 5, &state);
+      reinterpret_cast<const char8_t *>(src), 5, SIZE_MAX, &state);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
 
   res = sc.popUTF32();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.error()), -1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
   const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), &state);
+      reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
   // first 4 bytes are clown emoji (🤡)
-  // next 2 don't form a complete character
-  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88";
+  // next 3 form an invalid character
+  const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), &state);
+      reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
 
   res = sc.popUTF32();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+}
+
+TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
+  /*
+  We do NOT test partially popping a character and expecting the next
+  StringConverter to continue where we left off. This is not expected to work
+  and considered invalid.
+  */
+  const wchar_t *src = L"\x1f921\xff"; // clown emoji, sigma symbol
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
+      reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
+
+  auto res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+
+  res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+
+  res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+
+  res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
+
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
+      reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), 1,
+      SIZE_MAX, &state);
+
+  res = sc2.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 0);
+
+  res = sc2.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xBF);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+}
+
+TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
+  const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc1(
+      reinterpret_cast<const char8_t *>(src), 2, SIZE_MAX, &state);
+
+  auto res = sc1.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), -1);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 2);
+
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc2(
+      reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), 3,
+      SIZE_MAX, &state);
+
+  res = sc2.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 2);
+
+  res = sc2.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 3);
+}
+
+TEST(LlvmLibcStringConverterTest, DstLimitUTF8To32) {
+  const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), SIZE_MAX, 1, &state);
+
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+  res = sc.popUTF32(); // no space to pop this into
+  ASSERT_FALSE(res.has_value());
+}
+
+TEST(LlvmLibcStringConverterTest, DstLimitUTF32To8) {
+  const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), SIZE_MAX, 5, &state);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 }

>From 86fb76f469ecdc82c74ae49411098cd9405537fb Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 7 Jul 2025 17:22:52 +0000
Subject: [PATCH 05/12] Updated internal member name (bytes_pushed isn't
 accurate)

---
 libc/src/__support/wchar/string_converter.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index d7c735a8c7b17..8c0417a3b7df4 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -26,20 +26,20 @@ template <typename T> class StringConverter {
   const T *src;
   size_t src_len;
   size_t src_idx;
-  size_t bytes_pushed;
+  size_t num_pushed;
   size_t num_to_write;
 
   int pushFullCharacter() {
-    for (bytes_pushed = 0; !cr.isFull() && src_idx + bytes_pushed < src_len;
-         ++bytes_pushed) {
-      int err = cr.push(src[src_idx + bytes_pushed]);
+    for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
+         ++num_pushed) {
+      int err = cr.push(src[src_idx + num_pushed]);
       if (err != 0)
         return err;
     }
 
     // if we aren't able to read a full character from the source string
-    if (src_idx + bytes_pushed == src_len && !cr.isFull()) {
-      src_idx += bytes_pushed;
+    if (src_idx + num_pushed == src_len && !cr.isFull()) {
+      src_idx += num_pushed;
       return -1;
     }
 
@@ -48,7 +48,7 @@ template <typename T> class StringConverter {
 
 public:
   StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate *ps)
-      : cr(ps), src(s), src_len(srclen), src_idx(0), bytes_pushed(0),
+      : cr(ps), src(s), src_len(srclen), src_idx(0), num_pushed(0),
         num_to_write(dstlen) {
     pushFullCharacter();
   }
@@ -70,7 +70,7 @@ template <typename T> class StringConverter {
 
     auto out = cr.pop_utf32();
     if (cr.isEmpty())
-      src_idx += bytes_pushed;
+      src_idx += num_pushed;
 
     if (out.has_value() && out.value() == L'\0')
       src_len = src_idx;
@@ -94,7 +94,7 @@ template <typename T> class StringConverter {
 
     auto out = cr.pop_utf8();
     if (cr.isEmpty())
-      src_idx += bytes_pushed;
+      src_idx += num_pushed;
 
     if (out.has_value() && out.value() == '\0')
       src_len = src_idx;

>From 5e4b5e26fbe62a1a6b0ed506e31bc832db69cd0a Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Tue, 8 Jul 2025 22:19:57 +0000
Subject: [PATCH 06/12] used proxy headers; improved tests

---
 libc/src/__support/wchar/CMakeLists.txt       |  1 +
 .../__support/wchar/character_converter.cpp   |  2 +-
 .../src/__support/wchar/character_converter.h |  2 +-
 libc/src/__support/wchar/string_converter.h   |  6 ++
 .../__support/wchar/string_converter_test.cpp | 56 +++++++++++++++----
 5 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 0828e4057c172..802441d37fe92 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -29,6 +29,7 @@ add_object_library(
     libc.hdr.errno_macros
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.hdr.types.size_t
     libc.src.__support.error_or
     libc.src.__support.math_extras
     .mbstate
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 8d7c4183c5998..15d0f478a18a9 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -9,12 +9,12 @@
 #include "hdr/errno_macros.h"
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
 #include "src/__support/wchar/mbstate.h"
-#include <stddef.h>
 
 #include "character_converter.h"
 
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index 9e8dd71028002..b6d918f2d2edc 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,10 +11,10 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
-#include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 8c0417a3b7df4..d471c3541a79a 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -26,7 +26,11 @@ template <typename T> class StringConverter {
   const T *src;
   size_t src_len;
   size_t src_idx;
+
+  // # of src elements pushed to cr needed to represent the current character
   size_t num_pushed;
+
+  // # of pops we are allowed to perform (essentially size of the dest buffer)
   size_t num_to_write;
 
   int pushFullCharacter() {
@@ -56,6 +60,8 @@ template <typename T> class StringConverter {
   StringConverter(const T *s, size_t dstlen, mbstate *ps)
       : StringConverter(s, SIZE_MAX, dstlen, ps) {}
 
+  // TODO: following functions are almost identical
+  // look into templating CharacterConverter pop functions
   ErrorOr<char32_t> popUTF32() {
     if (cr.isEmpty()) {
       int err = pushFullCharacter();
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index cb908e2fec2a8..5066d468d4ede 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -15,8 +15,11 @@
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcStringConverterTest, UTF8To32) {
-  // first 4 bytes are clown emoji (🤡), then next 3 are sigma symbol (∑)
-  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  // first 4 bytes are clown emoji (🤡)
+  // next 3 bytes are sigma symbol (∑)
+  // next 2 bytes are y with diaeresis (ÿ)
+  // last byte is the letter A
+  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
       reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
@@ -31,19 +34,30 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
   ASSERT_EQ(static_cast<int>(res.value()), 0x2211);
   ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 7);
 
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xff);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 9);
+
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x41);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 10);
+
   res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 8);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 11);
 
   res = sc.popUTF32();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(res.error(), -1);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 8);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 11);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8) {
-  const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
+  // clown emoji, sigma symbol, y with diaeresis, letter A
+  const wchar_t *src = L"\x1f921\x2211\xff\x41";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
@@ -68,6 +82,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
   ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
   ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
+  // end of clown emoji, sigma symbol begins
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
@@ -83,15 +98,33 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
   ASSERT_EQ(static_cast<int>(res.value()), 0x91);
   ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
 
+  // end of sigma symbol, y with diaeresis begins
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
-  ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xBF);
   ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
 
+  // end of y with diaeresis, letter A begins
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x41);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+  // null byte
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
+
   res = sc.popUTF8();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(res.error(), -1);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
@@ -120,6 +153,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
   ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
   ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
+  // can only read 1 character from source string, so error on next pop
   res = sc.popUTF8();
   ASSERT_FALSE(res.has_value());
   ASSERT_EQ(res.error(), -1);
@@ -200,7 +234,7 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
   StringConverter to continue where we left off. This is not expected to work
   and considered invalid.
   */
-  const wchar_t *src = L"\x1f921\xff"; // clown emoji, sigma symbol
+  const wchar_t *src = L"\x1f921\xff"; // clown emoji, y with diaeresis (ÿ)
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
       reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
@@ -225,6 +259,7 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
   ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
   ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
 
+  // sc2 should pick up where sc1 left off and continue the conversion
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
       reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), 1,
       SIZE_MAX, &state);
@@ -251,6 +286,7 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
   ASSERT_EQ(static_cast<int>(res.error()), -1);
   ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 2);
 
+  // sc2 should pick up where sc1 left off and continue the conversion
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc2(
       reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), 3,
       SIZE_MAX, &state);
@@ -266,7 +302,7 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
   ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 3);
 }
 
-TEST(LlvmLibcStringConverterTest, DstLimitUTF8To32) {
+TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
   const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
@@ -280,7 +316,7 @@ TEST(LlvmLibcStringConverterTest, DstLimitUTF8To32) {
   ASSERT_FALSE(res.has_value());
 }
 
-TEST(LlvmLibcStringConverterTest, DstLimitUTF32To8) {
+TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
   const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(

>From 19b9397e33f25aa2309142e7d2bc0f76d969b3d2 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 9 Jul 2025 21:04:06 +0000
Subject: [PATCH 07/12] incremented src_idx on first pop rather than last

---
 libc/src/__support/wchar/string_converter.h   | 56 ++++++++--------
 .../__support/wchar/string_converter_test.cpp | 66 +++++++++----------
 2 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index d471c3541a79a..34561d975d41e 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -27,57 +27,55 @@ template <typename T> class StringConverter {
   size_t src_len;
   size_t src_idx;
 
-  // # of src elements pushed to cr needed to represent the current character
-  size_t num_pushed;
-
   // # of pops we are allowed to perform (essentially size of the dest buffer)
   size_t num_to_write;
 
-  int pushFullCharacter() {
+  // on the very first pop, we need to make sure that we always
+  // pushFullCharacter in case a previous StringConverter pushed part of a
+  // character to the mbstate
+  bool first_pop;
+
+  ErrorOr<size_t> pushFullCharacter() {
+    size_t num_pushed;
     for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
          ++num_pushed) {
       int err = cr.push(src[src_idx + num_pushed]);
       if (err != 0)
-        return err;
+        return Error(err);
     }
 
     // if we aren't able to read a full character from the source string
     if (src_idx + num_pushed == src_len && !cr.isFull()) {
       src_idx += num_pushed;
-      return -1;
+      return Error(-1);
     }
 
-    return 0;
+    return num_pushed;
   }
 
 public:
-  StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate *ps)
-      : cr(ps), src(s), src_len(srclen), src_idx(0), num_pushed(0),
-        num_to_write(dstlen) {
-    pushFullCharacter();
-  }
-
-  StringConverter(const T *s, size_t dstlen, mbstate *ps)
-      : StringConverter(s, SIZE_MAX, dstlen, ps) {}
+  StringConverter(const T *s, mbstate *ps, size_t dstlen, size_t srclen=SIZE_MAX)
+      : cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen),
+        first_pop(true) {}
 
   // TODO: following functions are almost identical
   // look into templating CharacterConverter pop functions
   ErrorOr<char32_t> popUTF32() {
-    if (cr.isEmpty()) {
-      int err = pushFullCharacter();
-      if (err != 0)
-        return Error(err);
+    if (cr.isEmpty() || first_pop) {
+      first_pop = false;
+      auto src_elements_read = pushFullCharacter();
+      if (!src_elements_read.has_value())
+        return Error(src_elements_read.error());
 
       if (cr.sizeAsUTF32() > num_to_write) {
         cr.clear();
         return Error(-1);
       }
+
+      src_idx += src_elements_read.value();
     }
 
     auto out = cr.pop_utf32();
-    if (cr.isEmpty())
-      src_idx += num_pushed;
-
     if (out.has_value() && out.value() == L'\0')
       src_len = src_idx;
 
@@ -87,21 +85,21 @@ template <typename T> class StringConverter {
   }
 
   ErrorOr<char8_t> popUTF8() {
-    if (cr.isEmpty()) {
-      int err = pushFullCharacter();
-      if (err != 0)
-        return Error(err);
+    if (cr.isEmpty() || first_pop) {
+      first_pop = false;
+      auto src_elements_read = pushFullCharacter();
+      if (!src_elements_read.has_value())
+        return Error(src_elements_read.error());
 
       if (cr.sizeAsUTF8() > num_to_write) {
         cr.clear();
         return Error(-1);
       }
+
+      src_idx += src_elements_read.value();
     }
 
     auto out = cr.pop_utf8();
-    if (cr.isEmpty())
-      src_idx += num_pushed;
-
     if (out.has_value() && out.value() == '\0')
       src_len = src_idx;
 
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 5066d468d4ede..8dd743085120c 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -22,7 +22,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
   const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
@@ -60,22 +60,22 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
   const wchar_t *src = L"\x1f921\x2211\xff\x41";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
@@ -86,12 +86,12 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x88);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
@@ -102,7 +102,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
@@ -131,22 +131,22 @@ TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
   const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
@@ -164,7 +164,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
   const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), 5, SIZE_MAX, &state);
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 5);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
@@ -181,22 +181,22 @@ TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
   const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
@@ -215,7 +215,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
   const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30";
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
@@ -237,22 +237,22 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
   const wchar_t *src = L"\x1f921\xff"; // clown emoji, y with diaeresis (ÿ)
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
-      reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
 
   auto res = sc1.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
-  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
 
   res = sc1.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
-  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
 
   res = sc1.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
-  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
 
   res = sc1.popUTF8();
   ASSERT_TRUE(res.has_value());
@@ -261,13 +261,13 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
 
   // sc2 should pick up where sc1 left off and continue the conversion
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
-      reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), 1,
-      SIZE_MAX, &state);
+      reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), &state,
+      SIZE_MAX, 1);
 
   res = sc2.popUTF8();
   ASSERT_TRUE(res.has_value());
   ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
-  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
 
   res = sc2.popUTF8();
   ASSERT_TRUE(res.has_value());
@@ -279,7 +279,7 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
   const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc1(
-      reinterpret_cast<const char8_t *>(src), 2, SIZE_MAX, &state);
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 2);
 
   auto res = sc1.popUTF32();
   ASSERT_FALSE(res.has_value());
@@ -288,8 +288,8 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
 
   // sc2 should pick up where sc1 left off and continue the conversion
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc2(
-      reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), 3,
-      SIZE_MAX, &state);
+      reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), &state,
+      SIZE_MAX, 3);
 
   res = sc2.popUTF32();
   ASSERT_TRUE(res.has_value());
@@ -306,7 +306,7 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
   const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), SIZE_MAX, 1, &state);
+      reinterpret_cast<const char8_t *>(src), &state, 1, SIZE_MAX);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
@@ -320,19 +320,19 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
   const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), SIZE_MAX, 5, &state);
+      reinterpret_cast<const char32_t *>(src), &state, 5, SIZE_MAX);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());
-  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
 
   res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());

>From 8e969d9eabd6297fb4f4489d1f708bf8eddcb9f6 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 9 Jul 2025 21:23:01 +0000
Subject: [PATCH 08/12] formatting

---
 libc/src/__support/wchar/string_converter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 34561d975d41e..9a9b7d2ec670a 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -54,7 +54,8 @@ template <typename T> class StringConverter {
   }
 
 public:
-  StringConverter(const T *s, mbstate *ps, size_t dstlen, size_t srclen=SIZE_MAX)
+  StringConverter(const T *s, mbstate *ps, size_t dstlen,
+                  size_t srclen = SIZE_MAX)
       : cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen),
         first_pop(true) {}
 

>From a5e8646d3f703b914628b00c702357d46edc169e Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 9 Jul 2025 22:19:07 +0000
Subject: [PATCH 09/12] removed redundant bool

---
 libc/src/__support/wchar/string_converter.h        | 14 +++-----------
 .../src/__support/wchar/string_converter_test.cpp  |  4 ++--
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 9a9b7d2ec670a..0635bc57bf3e2 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -30,11 +30,6 @@ template <typename T> class StringConverter {
   // # of pops we are allowed to perform (essentially size of the dest buffer)
   size_t num_to_write;
 
-  // on the very first pop, we need to make sure that we always
-  // pushFullCharacter in case a previous StringConverter pushed part of a
-  // character to the mbstate
-  bool first_pop;
-
   ErrorOr<size_t> pushFullCharacter() {
     size_t num_pushed;
     for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
@@ -56,14 +51,12 @@ template <typename T> class StringConverter {
 public:
   StringConverter(const T *s, mbstate *ps, size_t dstlen,
                   size_t srclen = SIZE_MAX)
-      : cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen),
-        first_pop(true) {}
+      : cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen) {}
 
   // TODO: following functions are almost identical
   // look into templating CharacterConverter pop functions
   ErrorOr<char32_t> popUTF32() {
-    if (cr.isEmpty() || first_pop) {
-      first_pop = false;
+    if (cr.isEmpty() || src_idx == 0) {
       auto src_elements_read = pushFullCharacter();
       if (!src_elements_read.has_value())
         return Error(src_elements_read.error());
@@ -86,8 +79,7 @@ template <typename T> class StringConverter {
   }
 
   ErrorOr<char8_t> popUTF8() {
-    if (cr.isEmpty() || first_pop) {
-      first_pop = false;
+    if (cr.isEmpty() || src_idx == 0) {
       auto src_elements_read = pushFullCharacter();
       if (!src_elements_read.has_value())
         return Error(src_elements_read.error());
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 8dd743085120c..b11df19f0dafb 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -306,7 +306,7 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
   const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
-      reinterpret_cast<const char8_t *>(src), &state, 1, SIZE_MAX);
+      reinterpret_cast<const char8_t *>(src), &state, 1);
 
   auto res = sc.popUTF32();
   ASSERT_TRUE(res.has_value());
@@ -320,7 +320,7 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
   const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
-      reinterpret_cast<const char32_t *>(src), &state, 5, SIZE_MAX);
+      reinterpret_cast<const char32_t *>(src), &state, 5);
 
   auto res = sc.popUTF8();
   ASSERT_TRUE(res.has_value());

>From 7ef9579b4a556cbcc938e50dd2a78d70fc463d6b Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 10 Jul 2025 23:36:50 +0000
Subject: [PATCH 10/12] exclude windows for wchar tests

---
 libc/src/__support/CMakeLists.txt      | 2 +-
 libc/test/src/__support/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 7e85136c08851..9de6971b73294 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -393,7 +393,7 @@ add_subdirectory(time)
 
 # Requires access to uchar header which is not on macos
 # Therefore, cannot currently build this on macos in overlay mode
-if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
   add_subdirectory(wchar)
 endif()
 
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 9f626ed31cc07..578bef871fed5 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -277,6 +277,6 @@ add_subdirectory(time)
 add_subdirectory(threads)
 # Requires access to uchar header which is not on MacOS
 # Cannot currently build this on MacOS in overlay mode
-if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
   add_subdirectory(wchar)
 endif()

>From 14fe00af545713978363031d6ac9d558883ab71d Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 14 Jul 2025 16:41:25 +0000
Subject: [PATCH 11/12] fixed stringconverter windows build

---
 libc/src/__support/CMakeLists.txt                      |  2 +-
 libc/test/src/__support/CMakeLists.txt                 |  2 +-
 .../test/src/__support/wchar/string_converter_test.cpp | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 9de6971b73294..7186948f823b2 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -393,7 +393,7 @@ add_subdirectory(time)
 
 # Requires access to uchar header which is not on macos
 # Therefore, cannot currently build this on macos in overlay mode
-if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
+if(NOT (LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
 
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 578bef871fed5..1a184d428d2ae 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -277,6 +277,6 @@ add_subdirectory(time)
 add_subdirectory(threads)
 # Requires access to uchar header which is not on MacOS
 # Cannot currently build this on MacOS in overlay mode
-if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
+if(NOT (LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index b11df19f0dafb..cad8735791fa2 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -57,7 +57,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
 
 TEST(LlvmLibcStringConverterTest, UTF32To8) {
   // clown emoji, sigma symbol, y with diaeresis, letter A
-  const wchar_t *src = L"\x1f921\x2211\xff\x41";
+  const wchar_t src[] = {0x1f921, 0x2211, 0xff, 0x41, 0x0};
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
@@ -128,7 +128,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
-  const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
+  const wchar_t src[] = {0x1f921, 0x2211, 0x0}; // clown emoji, sigma symbol
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
@@ -178,7 +178,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
-  const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
+  const wchar_t src[] = {0x1f921, 0xffffff, 0x0}; // clown emoji, invalid utf32
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
@@ -234,7 +234,7 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
   StringConverter to continue where we left off. This is not expected to work
   and considered invalid.
   */
-  const wchar_t *src = L"\x1f921\xff"; // clown emoji, y with diaeresis (ÿ)
+  const wchar_t src[] = {0x1f921, 0xff, 0x0}; // clown emoji, y with diaeresis (ÿ)
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
@@ -317,7 +317,7 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
 }
 
 TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
-  const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
+  const wchar_t src[] = {0x1f921, 0x1f921}; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, 5);

>From d09f18ed7cc761029c330087cfc24f9f3c93cc77 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 14 Jul 2025 16:56:10 +0000
Subject: [PATCH 12/12] formatting + explicit casts

---
 .../__support/wchar/string_converter_test.cpp | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index cad8735791fa2..1b0ba64401c51 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -57,7 +57,10 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
 
 TEST(LlvmLibcStringConverterTest, UTF32To8) {
   // clown emoji, sigma symbol, y with diaeresis, letter A
-  const wchar_t src[] = {0x1f921, 0x2211, 0xff, 0x41, 0x0};
+  const wchar_t src[] = {static_cast<wchar_t>(0x1f921),
+                         static_cast<wchar_t>(0x2211),
+                         static_cast<wchar_t>(0xff), static_cast<wchar_t>(0x41),
+                         static_cast<wchar_t>(0x0)};
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
@@ -128,7 +131,9 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
-  const wchar_t src[] = {0x1f921, 0x2211, 0x0}; // clown emoji, sigma symbol
+  const wchar_t src[] = {
+      static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0x2211),
+      static_cast<wchar_t>(0x0)}; // clown emoji, sigma symbol
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
@@ -178,7 +183,9 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
 }
 
 TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
-  const wchar_t src[] = {0x1f921, 0xffffff, 0x0}; // clown emoji, invalid utf32
+  const wchar_t src[] = {
+      static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0xffffff),
+      static_cast<wchar_t>(0x0)}; // clown emoji, invalid utf32
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
@@ -234,7 +241,9 @@ TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
   StringConverter to continue where we left off. This is not expected to work
   and considered invalid.
   */
-  const wchar_t src[] = {0x1f921, 0xff, 0x0}; // clown emoji, y with diaeresis (ÿ)
+  const wchar_t src[] = {
+      static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0xff),
+      static_cast<wchar_t>(0x0)}; // clown emoji, y with diaeresis (ÿ)
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
       reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
@@ -317,7 +326,8 @@ TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
 }
 
 TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
-  const wchar_t src[] = {0x1f921, 0x1f921}; // 2 clown emojis
+  const wchar_t src[] = {static_cast<wchar_t>(0x1f921),
+                         static_cast<wchar_t>(0x1f921)}; // 2 clown emojis
   LIBC_NAMESPACE::internal::mbstate state;
   LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
       reinterpret_cast<const char32_t *>(src), &state, 5);



More information about the libc-commits mailing list