[libc-commits] [libc] [libc] Refactored internal wcrtomb function (PR #145923)
Uzair Nawaz via libc-commits
libc-commits at lists.llvm.org
Thu Jun 26 09:40:04 PDT 2025
https://github.com/uzairnawaz updated https://github.com/llvm/llvm-project/pull/145923
>From 7b9b84a43c6356e442a2f78b0b842ac9f9dccef3 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 26 Jun 2025 16:31:31 +0000
Subject: [PATCH 1/2] Generalized internal wcrtomb function
---
libc/src/__support/wchar/CMakeLists.txt | 6 +-
.../{wcrtomb.cpp => wcrtomb_bounded.cpp} | 24 ++--
.../wchar/{wcrtomb.h => wcrtomb_bounded.h} | 9 +-
libc/src/wchar/CMakeLists.txt | 4 +-
libc/src/wchar/wcrtomb.cpp | 12 +-
libc/src/wchar/wctomb.cpp | 4 +-
libc/test/src/__support/wchar/CMakeLists.txt | 11 ++
.../__support/wchar/wcrtomb_bounded_test.cpp | 126 ++++++++++++++++++
8 files changed, 170 insertions(+), 26 deletions(-)
rename libc/src/__support/wchar/{wcrtomb.cpp => wcrtomb_bounded.cpp} (67%)
rename libc/src/__support/wchar/{wcrtomb.h => wcrtomb_bounded.h} (68%)
create mode 100644 libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 86a47319f278a..db57d6488bd37 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -21,11 +21,11 @@ add_object_library(
)
add_object_library(
- wcrtomb
+ wcrtomb_bounded
HDRS
- wcrtomb.h
+ wcrtomb_bounded.h
SRCS
- wcrtomb.cpp
+ wcrtomb_bounded.cpp
DEPENDS
libc.hdr.errno_macros
libc.hdr.types.char32_t
diff --git a/libc/src/__support/wchar/wcrtomb.cpp b/libc/src/__support/wchar/wcrtomb_bounded.cpp
similarity index 67%
rename from libc/src/__support/wchar/wcrtomb.cpp
rename to libc/src/__support/wchar/wcrtomb_bounded.cpp
index a74a6f3ec34a6..3355c08a938ef 100644
--- a/libc/src/__support/wchar/wcrtomb.cpp
+++ b/libc/src/__support/wchar/wcrtomb_bounded.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "src/__support/wchar/wcrtomb.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"
@@ -21,8 +21,8 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc,
- mbstate *__restrict ps) {
+ErrorOr<size_t> wcrtomb_bounded(char *__restrict s, wchar_t wc,
+ mbstate *__restrict ps, size_t max_written) {
static_assert(sizeof(wchar_t) == 4);
CharacterConverter cr(ps);
@@ -30,15 +30,19 @@ ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc,
if (!cr.isValidState())
return Error(EINVAL);
+ char buf[sizeof(wchar_t) / sizeof(char)];
if (s == nullptr)
- return Error(EILSEQ);
+ s = buf;
- int status = cr.push(static_cast<char32_t>(wc));
- if (status != 0)
- return Error(EILSEQ);
+ // if cr isnt empty, it should be represented in mbstate already
+ if (cr.isEmpty()) {
+ int status = cr.push(static_cast<char32_t>(wc));
+ if (status != 0)
+ return Error(EILSEQ);
+ }
size_t count = 0;
- while (!cr.isEmpty()) {
+ while (!cr.isEmpty() && count < max_written) {
auto utf8 = cr.pop_utf8(); // can never fail as long as the push succeeded
LIBC_ASSERT(utf8.has_value());
@@ -46,6 +50,10 @@ ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc,
s++;
count++;
}
+
+ if (!cr.isEmpty()) // didn't complete the conversion
+ return -1;
+
return count;
}
diff --git a/libc/src/__support/wchar/wcrtomb.h b/libc/src/__support/wchar/wcrtomb_bounded.h
similarity index 68%
rename from libc/src/__support/wchar/wcrtomb.h
rename to libc/src/__support/wchar/wcrtomb_bounded.h
index bcd39a92a3b76..fb310dcef40a8 100644
--- a/libc/src/__support/wchar/wcrtomb.h
+++ b/libc/src/__support/wchar/wcrtomb_bounded.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
-#define LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
+#ifndef LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_BOUNDED_H
+#define LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_BOUNDED_H
#include "hdr/types/size_t.h"
#include "hdr/types/wchar_t.h"
@@ -18,9 +18,10 @@
namespace LIBC_NAMESPACE_DECL {
namespace internal {
-ErrorOr<size_t> wcrtomb(char *__restrict s, wchar_t wc, mbstate *__restrict ps);
+ErrorOr<size_t> wcrtomb_bounded(char *__restrict s, wchar_t wc,
+ mbstate *__restrict ps, size_t max_written);
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
-#endif // LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_H
+#endif // LLVM_LIBC_SRC__SUPPORT_WCHAR_WCRTOMB_BOUNDED_H
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 16664100d42c7..cc610502f5c7d 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -44,7 +44,7 @@ add_entrypoint_object(
libc.hdr.types.wchar_t
libc.hdr.types.mbstate_t
libc.src.__support.libc_errno
- libc.src.__support.wchar.wcrtomb
+ libc.src.__support.wchar.wcrtomb_bounded
libc.src.__support.wchar.mbstate
)
@@ -56,7 +56,7 @@ add_entrypoint_object(
wctomb.h
DEPENDS
libc.hdr.types.wchar_t
- libc.src.__support.wchar.wcrtomb
+ libc.src.__support.wchar.wcrtomb_bounded
libc.src.__support.wchar.mbstate
libc.src.__support.libc_errno
)
diff --git a/libc/src/wchar/wcrtomb.cpp b/libc/src/wchar/wcrtomb.cpp
index 3e9df0599431e..56d03e9fe5ebf 100644
--- a/libc/src/wchar/wcrtomb.cpp
+++ b/libc/src/wchar/wcrtomb.cpp
@@ -14,7 +14,7 @@
#include "src/__support/libc_errno.h"
#include "src/__support/macros/config.h"
#include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/wcrtomb.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
namespace LIBC_NAMESPACE_DECL {
@@ -23,16 +23,14 @@ LLVM_LIBC_FUNCTION(size_t, wcrtomb,
static internal::mbstate internal_mbstate;
// when s is nullptr, this is equivalent to wcrtomb(buf, L'\0', ps)
- char buf[sizeof(wchar_t) / sizeof(char)];
- if (s == nullptr) {
- s = buf;
+ if (s == nullptr)
wc = L'\0';
- }
- auto result = internal::wcrtomb(
+ auto result = internal::wcrtomb_bounded(
s, wc,
ps == nullptr ? &internal_mbstate
- : reinterpret_cast<internal::mbstate *>(ps));
+ : reinterpret_cast<internal::mbstate *>(ps),
+ sizeof(wchar_t));
if (!result.has_value()) {
libc_errno = result.error();
diff --git a/libc/src/wchar/wctomb.cpp b/libc/src/wchar/wctomb.cpp
index 142302e6ae09b..1f4babecfa1cf 100644
--- a/libc/src/wchar/wctomb.cpp
+++ b/libc/src/wchar/wctomb.cpp
@@ -13,7 +13,7 @@
#include "src/__support/libc_errno.h"
#include "src/__support/macros/config.h"
#include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/wcrtomb.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
namespace LIBC_NAMESPACE_DECL {
@@ -22,7 +22,7 @@ LLVM_LIBC_FUNCTION(int, wctomb, (char *s, wchar_t wc)) {
if (s == nullptr)
return 0;
- auto result = internal::wcrtomb(s, wc, &internal_mbstate);
+ auto result = internal::wcrtomb_bounded(s, wc, &internal_mbstate, sizeof(wchar_t));
if (!result.has_value()) { // invalid wide character
libc_errno = EILSEQ;
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024b..44bf4c7038d80 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,14 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.character_converter
)
+
+add_libc_test(
+ wcrtomb_bounded_test
+ SUITE
+ libc-support-tests
+ SRCS
+ wcrtomb_bounded_test.cpp
+ DEPENDS
+ libc.src.__support.wchar.wcrtomb_bounded
+ libc.src.__support.wchar.mbstate
+)
diff --git a/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp b/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
new file mode 100644
index 0000000000000..d191f4d3be7c3
--- /dev/null
+++ b/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
@@ -0,0 +1,126 @@
+//===-- Unittests for wcrtomb_bounded -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/wchar_t.h"
+#include "hdr/errno_macros.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/wcrtomb_bounded.h"
+#include "test/UnitTest/Test.h"
+
+// The majority of the following tests are the same as tests/src/wchar/wcrtomb_test.cpp
+
+TEST(LlvmLibcWCRToMBBoundedTest, OneByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ wchar_t wc = L'U';
+ char mb[4];
+ auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(1));
+ ASSERT_EQ(mb[0], 'U');
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, TwoByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ // testing utf32: 0xff -> utf8: 0xc3 0xbf
+ wchar_t wc = 0xff;
+ char mb[4];
+ auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(2));
+ ASSERT_EQ(mb[0], static_cast<char>(0xc3));
+ ASSERT_EQ(mb[1], static_cast<char>(0xbf));
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, ThreeByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
+ wchar_t wc = 0xac15;
+ char mb[4];
+ auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(3));
+ ASSERT_EQ(mb[0], static_cast<char>(0xea));
+ ASSERT_EQ(mb[1], static_cast<char>(0xb0));
+ ASSERT_EQ(mb[2], static_cast<char>(0x95));
+}
+
+TEST(LlvmLibcWCRToMBTest, FourByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+ wchar_t wc = 0x1f921;
+ char mb[4];
+ auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(4));
+ ASSERT_EQ(mb[0], static_cast<char>(0xf0));
+ ASSERT_EQ(mb[1], static_cast<char>(0x9f));
+ ASSERT_EQ(mb[2], static_cast<char>(0xa4));
+ ASSERT_EQ(mb[3], static_cast<char>(0xa1));
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, NullString) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ wchar_t wc = L'A';
+
+ // should return the multi byte length of the widechar
+ auto result =
+ LIBC_NAMESPACE::internal::wcrtomb_bounded(nullptr, wc, &state, 4);
+
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(1));
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, InvalidWchar) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ wchar_t wc = 0x12ffff;
+ char mb[4];
+ auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 4);
+ ASSERT_FALSE(result.has_value());
+ ASSERT_EQ(result.error(), EILSEQ);
+}
+
+TEST(LlvmLibcWCRToMBBoundedTest, InvalidMBState) {
+ LIBC_NAMESPACE::internal::mbstate inv;
+ inv.total_bytes = 6;
+ wchar_t wc = L'A';
+ char mb[4];
+ auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &inv, 4);
+ ASSERT_FALSE(result.has_value());
+ ASSERT_EQ(result.error(), EINVAL);
+}
+
+// wcrtomb_bounded unique tests
+
+TEST(LlvmLibcWCRToMBBoundedTest, ContinueConversion) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ // utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+ wchar_t wc = 0x1f921;
+ char mb[5] = {'\x01', '\x01', '\x01', '\x01', '\x01'};
+ auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 1);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(-1)); // conversion not completed
+ ASSERT_EQ(mb[0], '\xF0');
+ ASSERT_EQ(mb[1], '\x01');
+
+ result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb + 1, wc, &state, 2);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(-1)); // conversion not completed
+ ASSERT_EQ(mb[0], '\xF0');
+ ASSERT_EQ(mb[1], '\x9F');
+ ASSERT_EQ(mb[2], '\xA4');
+ ASSERT_EQ(mb[3], '\x01');
+
+ result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb + 3, wc, &state, 100);
+ ASSERT_TRUE(result.has_value());
+ ASSERT_EQ(result.value(), static_cast<size_t>(1));
+ ASSERT_EQ(mb[0], '\xF0');
+ ASSERT_EQ(mb[1], '\x9F');
+ ASSERT_EQ(mb[2], '\xA4');
+ ASSERT_EQ(mb[3], '\xA1');
+ ASSERT_EQ(mb[4], '\x01');
+}
\ No newline at end of file
>From 33a51b13fa100362404f0d40b8179494469909fb Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 26 Jun 2025 16:39:28 +0000
Subject: [PATCH 2/2] formatting
---
libc/src/wchar/wctomb.cpp | 3 ++-
.../test/src/__support/wchar/wcrtomb_bounded_test.cpp | 11 +++++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/libc/src/wchar/wctomb.cpp b/libc/src/wchar/wctomb.cpp
index 1f4babecfa1cf..2b2c67f4dc535 100644
--- a/libc/src/wchar/wctomb.cpp
+++ b/libc/src/wchar/wctomb.cpp
@@ -22,7 +22,8 @@ LLVM_LIBC_FUNCTION(int, wctomb, (char *s, wchar_t wc)) {
if (s == nullptr)
return 0;
- auto result = internal::wcrtomb_bounded(s, wc, &internal_mbstate, sizeof(wchar_t));
+ auto result =
+ internal::wcrtomb_bounded(s, wc, &internal_mbstate, sizeof(wchar_t));
if (!result.has_value()) { // invalid wide character
libc_errno = EILSEQ;
diff --git a/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp b/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
index d191f4d3be7c3..8ba5cfb119d29 100644
--- a/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
+++ b/libc/test/src/__support/wchar/wcrtomb_bounded_test.cpp
@@ -6,13 +6,14 @@
//
//===----------------------------------------------------------------------===//
-#include "hdr/types/wchar_t.h"
#include "hdr/errno_macros.h"
+#include "hdr/types/wchar_t.h"
#include "src/__support/wchar/mbstate.h"
#include "src/__support/wchar/wcrtomb_bounded.h"
#include "test/UnitTest/Test.h"
-// The majority of the following tests are the same as tests/src/wchar/wcrtomb_test.cpp
+// The majority of the following tests are the same as
+// tests/src/wchar/wcrtomb_test.cpp
TEST(LlvmLibcWCRToMBBoundedTest, OneByte) {
LIBC_NAMESPACE::internal::mbstate state;
@@ -103,13 +104,15 @@ TEST(LlvmLibcWCRToMBBoundedTest, ContinueConversion) {
char mb[5] = {'\x01', '\x01', '\x01', '\x01', '\x01'};
auto result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb, wc, &state, 1);
ASSERT_TRUE(result.has_value());
- ASSERT_EQ(result.value(), static_cast<size_t>(-1)); // conversion not completed
+ ASSERT_EQ(result.value(),
+ static_cast<size_t>(-1)); // conversion not completed
ASSERT_EQ(mb[0], '\xF0');
ASSERT_EQ(mb[1], '\x01');
result = LIBC_NAMESPACE::internal::wcrtomb_bounded(mb + 1, wc, &state, 2);
ASSERT_TRUE(result.has_value());
- ASSERT_EQ(result.value(), static_cast<size_t>(-1)); // conversion not completed
+ ASSERT_EQ(result.value(),
+ static_cast<size_t>(-1)); // conversion not completed
ASSERT_EQ(mb[0], '\xF0');
ASSERT_EQ(mb[1], '\x9F');
ASSERT_EQ(mb[2], '\xA4');
More information about the libc-commits
mailing list