[libc-commits] [libc] [libc] Refactored internal wcrtomb function (PR #145923)

Uzair Nawaz via libc-commits libc-commits at lists.llvm.org
Thu Jun 26 09:34:40 PDT 2025


https://github.com/uzairnawaz created https://github.com/llvm/llvm-project/pull/145923

The current interface of the internal wcrtomb is very similar to the public api.
To make implementing functions like wcsrtombs easier, this patch adds a max_written parameter to the internal wcrtomb (now called wcrtomb_bounded) to set an upper bound on the number of bytes that can be written. If not all bytes are written, the function can be called again with the same mbstate to continue the conversion. 


>From 7b9b84a43c6356e442a2f78b0b842ac9f9dccef3 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 26 Jun 2025 16:31:31 +0000
Subject: [PATCH] Generalized internal wcrtomb function

---
 libc/src/__support/wchar/CMakeLists.txt       |   6 +-
 .../{wcrtomb.cpp => wcrtomb_bounded.cpp}      |  24 ++--
 .../wchar/{wcrtomb.h => wcrtomb_bounded.h}    |   9 +-
 libc/src/wchar/CMakeLists.txt                 |   4 +-
 libc/src/wchar/wcrtomb.cpp                    |  12 +-
 libc/src/wchar/wctomb.cpp                     |   4 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  11 ++
 .../__support/wchar/wcrtomb_bounded_test.cpp  | 126 ++++++++++++++++++
 8 files changed, 170 insertions(+), 26 deletions(-)
 rename libc/src/__support/wchar/{wcrtomb.cpp => wcrtomb_bounded.cpp} (67%)
 rename libc/src/__support/wchar/{wcrtomb.h => wcrtomb_bounded.h} (68%)
 create mode 100644 libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 86a47319f278a..db57d6488bd37 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -21,11 +21,11 @@ add_object_library(
 )
 
 add_object_library(
-  wcrtomb
+  wcrtomb_bounded
   HDRS
-    wcrtomb.h
+    wcrtomb_bounded.h
   SRCS 
-    wcrtomb.cpp
+    wcrtomb_bounded.cpp
   DEPENDS
     libc.hdr.errno_macros
     libc.hdr.types.char32_t
diff --git a/libc/src/__support/wchar/wcrtomb.cpp b/libc/src/__support/wchar/wcrtomb_bounded.cpp
similarity index 67%
rename from libc/src/__support/wchar/wcrtomb.cpp
rename to libc/src/__support/wchar/wcrtomb_bounded.cpp
index a74a6f3ec34a6..3355c08a938ef 100644
--- a/libc/src/__support/wchar/wcrtomb.cpp
+++ b/libc/src/__support/wchar/wcrtomb_bounded.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/__support/wchar/wcrtomb.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
 #include "src/__support/error_or.h"
 #include "src/__support/wchar/character_converter.h"
 #include "src/__support/wchar/mbstate.h"
@@ -21,8 +21,8 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc,
-                        mbstate *__restrict ps) {
+ErrorOr<size_t> wcrtomb_bounded(char *__restrict s, wchar_t wc,
+                                mbstate *__restrict ps, size_t max_written) {
   static_assert(sizeof(wchar_t) == 4);
 
   CharacterConverter cr(ps);
@@ -30,15 +30,19 @@ ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc,
   if (!cr.isValidState())
     return Error(EINVAL);
 
+  char buf[sizeof(wchar_t) / sizeof(char)];
   if (s == nullptr)
-    return Error(EILSEQ);
+    s = buf;
 
-  int status = cr.push(static_cast<char32_t>(wc));
-  if (status != 0)
-    return Error(EILSEQ);
+  // if cr isnt empty, it should be represented in mbstate already
+  if (cr.isEmpty()) {
+    int status = cr.push(static_cast<char32_t>(wc));
+    if (status != 0)
+      return Error(EILSEQ);
+  }
 
   size_t count = 0;
-  while (!cr.isEmpty()) {
+  while (!cr.isEmpty() && count < max_written) {
     auto utf8 = cr.pop_utf8(); // can never fail as long as the push succeeded
     LIBC_ASSERT(utf8.has_value());
 
@@ -46,6 +50,10 @@ ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc,
     s++;
     count++;
   }
+
+  if (!cr.isEmpty()) // didn't complete the conversion
+    return -1;
+
   return count;
 }
 
diff --git a/libc/src/__support/wchar/wcrtomb.h b/libc/src/__support/wchar/wcrtomb_bounded.h
similarity index 68%
rename from libc/src/__support/wchar/wcrtomb.h
rename to libc/src/__support/wchar/wcrtomb_bounded.h
index bcd39a92a3b76..fb310dcef40a8 100644
--- a/libc/src/__support/wchar/wcrtomb.h
+++ b/libc/src/__support/wchar/wcrtomb_bounded.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
-#define LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
+#ifndef LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_BOUNDED_H
+#define LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_BOUNDED_H
 
 #include "hdr/types/size_t.h"
 #include "hdr/types/wchar_t.h"
@@ -18,9 +18,10 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc, mbstate *__restrict ps);
+ErrorOr<size_t> wcrtomb_bounded(char *__restrict s, wchar_t wc,
+                                mbstate *__restrict ps, size_t max_written);
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
-#endif // LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
+#endif // LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_BOUNDED_H
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 16664100d42c7..cc610502f5c7d 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -44,7 +44,7 @@ add_entrypoint_object(
     libc.hdr.types.wchar_t
     libc.hdr.types.mbstate_t
     libc.src.__support.libc_errno
-    libc.src.__support.wchar.wcrtomb
+    libc.src.__support.wchar.wcrtomb_bounded
     libc.src.__support.wchar.mbstate
 )
 
@@ -56,7 +56,7 @@ add_entrypoint_object(
     wctomb.h
   DEPENDS
     libc.hdr.types.wchar_t
-    libc.src.__support.wchar.wcrtomb
+    libc.src.__support.wchar.wcrtomb_bounded
     libc.src.__support.wchar.mbstate
     libc.src.__support.libc_errno
 )
diff --git a/libc/src/wchar/wcrtomb.cpp b/libc/src/wchar/wcrtomb.cpp
index 3e9df0599431e..56d03e9fe5ebf 100644
--- a/libc/src/wchar/wcrtomb.cpp
+++ b/libc/src/wchar/wcrtomb.cpp
@@ -14,7 +14,7 @@
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/wcrtomb.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -23,16 +23,14 @@ LLVM_LIBC_FUNCTION(size_t, wcrtomb,
   static internal::mbstate internal_mbstate;
 
   // when s is nullptr, this is equivalent to wcrtomb(buf, L'\0', ps)
-  char buf[sizeof(wchar_t) / sizeof(char)];
-  if (s == nullptr) {
-    s = buf;
+  if (s == nullptr)
     wc = L'\0';
-  }
 
-  auto result = internal::wcrtomb(
+  auto result = internal::wcrtomb_bounded(
       s, wc,
       ps == nullptr ? &internal_mbstate
-                    : reinterpret_cast<internal::mbstate *>(ps));
+                    : reinterpret_cast<internal::mbstate *>(ps),
+      sizeof(wchar_t));
 
   if (!result.has_value()) {
     libc_errno = result.error();
diff --git a/libc/src/wchar/wctomb.cpp b/libc/src/wchar/wctomb.cpp
index 142302e6ae09b..1f4babecfa1cf 100644
--- a/libc/src/wchar/wctomb.cpp
+++ b/libc/src/wchar/wctomb.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/wcrtomb.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -22,7 +22,7 @@ LLVM_LIBC_FUNCTION(int, wctomb, (char *s, wchar_t wc)) {
   if (s == nullptr)
     return 0;
 
-  auto result = internal::wcrtomb(s, wc, &internal_mbstate);
+  auto result = internal::wcrtomb_bounded(s, wc, &internal_mbstate, sizeof(wchar_t));
 
   if (!result.has_value()) { // invalid wide character
     libc_errno = EILSEQ;
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024b..44bf4c7038d80 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,14 @@ add_libc_test(
   DEPENDS
     libc.src.__support.wchar.character_converter
 )
+
+add_libc_test(
+  wcrtomb_bounded_test
+  SUITE
+    libc-support-tests
+  SRCS
+    wcrtomb_bounded_test.cpp
+  DEPENDS
+    libc.src.__support.wchar.wcrtomb_bounded
+    libc.src.__support.wchar.mbstate
+)
diff --git a/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp b/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
new file mode 100644
index 0000000000000..d191f4d3be7c3
--- /dev/null
+++ b/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
@@ -0,0 +1,126 @@
+//===-- Unittests for wcrtomb_bounded -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/wchar_t.h"
+#include "hdr/errno_macros.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
+#include "test/UnitTest/Test.h"
+
+// The majority of the following tests are the same as tests/src/wchar/wcrtomb_test.cpp
+
+TEST(LlvmLibcWCRToMBBoundedTest, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  wchar_t wc = L'U';
+  char mb[4];
+  auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(1));
+  ASSERT_EQ(mb[0], 'U');
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, TwoByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  // testing utf32: 0xff -> utf8: 0xc3 0xbf
+  wchar_t wc = 0xff;
+  char mb[4];
+  auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(2));
+  ASSERT_EQ(mb[0], static_cast<char>(0xc3));
+  ASSERT_EQ(mb[1], static_cast<char>(0xbf));
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, ThreeByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
+  wchar_t wc = 0xac15;
+  char mb[4];
+  auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(3));
+  ASSERT_EQ(mb[0], static_cast<char>(0xea));
+  ASSERT_EQ(mb[1], static_cast<char>(0xb0));
+  ASSERT_EQ(mb[2], static_cast<char>(0x95));
+}
+
+TEST(LlvmLibcWCRToMBTest, FourByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  wchar_t wc = 0x1f921;
+  char mb[4];
+  auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(4));
+  ASSERT_EQ(mb[0], static_cast<char>(0xf0));
+  ASSERT_EQ(mb[1], static_cast<char>(0x9f));
+  ASSERT_EQ(mb[2], static_cast<char>(0xa4));
+  ASSERT_EQ(mb[3], static_cast<char>(0xa1));
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, NullString) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  wchar_t wc = L'A';
+
+  // should return the multi byte length of the widechar
+  auto result =
+      LIBC_NAMESPACE::internal::wcrtomb_bounded(nullptr, wc, &state, 4);
+
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(1));
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, InvalidWchar) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  wchar_t wc = 0x12ffff;
+  char mb[4];
+  auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+  ASSERT_FALSE(result.has_value());
+  ASSERT_EQ(result.error(), EILSEQ);
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, InvalidMBState) {
+  LIBC_NAMESPACE::internal::mbstate inv;
+  inv.total_bytes = 6;
+  wchar_t wc = L'A';
+  char mb[4];
+  auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &inv, 4);
+  ASSERT_FALSE(result.has_value());
+  ASSERT_EQ(result.error(), EINVAL);
+}
+
+// wcrtomb_bounded unique tests
+
+TEST(LlvmLibcWCRToMBBoundedTest, ContinueConversion) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  // utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  wchar_t wc = 0x1f921;
+  char mb[5] = {'\x01', '\x01', '\x01', '\x01', '\x01'};
+  auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 1);
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(-1)); // conversion not completed
+  ASSERT_EQ(mb[0], '\xF0');
+  ASSERT_EQ(mb[1], '\x01');
+
+  result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb + 1, wc, &state, 2);
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(-1)); // conversion not completed
+  ASSERT_EQ(mb[0], '\xF0');
+  ASSERT_EQ(mb[1], '\x9F');
+  ASSERT_EQ(mb[2], '\xA4');
+  ASSERT_EQ(mb[3], '\x01');
+
+  result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb + 3, wc, &state, 100);
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(result.value(), static_cast<size_t>(1));
+  ASSERT_EQ(mb[0], '\xF0');
+  ASSERT_EQ(mb[1], '\x9F');
+  ASSERT_EQ(mb[2], '\xA4');
+  ASSERT_EQ(mb[3], '\xA1');
+  ASSERT_EQ(mb[4], '\x01');
+}
\ No newline at end of file



More information about the libc-commits mailing list