[libc-commits] [libc] [libc] Fixed StringConverter Error Edge Case (PR #149356)
Uzair Nawaz via libc-commits
libc-commits at lists.llvm.org
Thu Jul 17 09:58:33 PDT 2025
https://github.com/uzairnawaz updated https://github.com/llvm/llvm-project/pull/149356
>From 0a10352da189462ebf8b539d74105ab77b655388 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 17 Jul 2025 16:47:06 +0000
Subject: [PATCH 1/2] fixed edge case where we try to pop an invalid character
even if there is no space in dest
---
libc/src/__support/wchar/string_converter.h | 8 ++-
.../__support/wchar/string_converter_test.cpp | 57 +++++++++++++++++++
2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 0635bc57bf3e2..b4e6fde954235 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -56,11 +56,14 @@ template <typename T> class StringConverter {
// TODO: following functions are almost identical
// look into templating CharacterConverter pop functions
ErrorOr<char32_t> popUTF32() {
+ if (num_to_write == 0)
+ return Error(-1);
+
if (cr.isEmpty() || src_idx == 0) {
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
return Error(src_elements_read.error());
-
+
if (cr.sizeAsUTF32() > num_to_write) {
cr.clear();
return Error(-1);
@@ -79,6 +82,9 @@ template <typename T> class StringConverter {
}
ErrorOr<char8_t> popUTF8() {
+ if (num_to_write == 0)
+ return Error(-1);
+
if (cr.isEmpty() || src_idx == 0) {
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 14d074156d033..d514df9317852 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -245,6 +245,63 @@ TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
}
+TEST(LlvmLibcStringConverterTest, InvalidCharacterOutsideBounds) {
+ // if an invalid character exists in the source string but we don't have space
+ // to write it, we should return a "stop converting" error rather than an
+ // invalid character error
+
+ // first 4 bytes are clown emoji (🤡)
+ // next 3 form an invalid character
+ const char *src1 = "\xF0\x9F\xA4\xA1\x90\x88\x30";
+ LIBC_NAMESPACE::internal::mbstate ps1;
+ LIBC_NAMESPACE::internal::StringConverter<char8_t> sc1(
+ reinterpret_cast<const char8_t *>(src1), &ps1, 1);
+
+ auto res1 = sc1.popUTF32();
+ ASSERT_TRUE(res1.has_value());
+ ASSERT_EQ(static_cast<int>(res1.value()), 0x1f921);
+ ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 4);
+
+ res1 = sc1.popUTF32();
+ ASSERT_FALSE(res1.has_value());
+ // no space to write error NOT invalid character error (EILSEQ)
+ ASSERT_EQ(static_cast<int>(res1.error()), -1);
+ ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 4);
+
+ const wchar_t src2[] = {
+ static_cast<wchar_t>(0x1f921), static_cast<wchar_t>(0xffffff),
+ static_cast<wchar_t>(0x0)}; // clown emoji, invalid utf32
+ LIBC_NAMESPACE::internal::mbstate ps2;
+ LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
+ reinterpret_cast<const char32_t *>(src2), &ps2, 4);
+
+ auto res2 = sc2.popUTF8();
+ ASSERT_TRUE(res2.has_value());
+ ASSERT_EQ(static_cast<int>(res2.value()), 0xF0);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+
+ res2 = sc2.popUTF8();
+ ASSERT_TRUE(res2.has_value());
+ ASSERT_EQ(static_cast<int>(res2.value()), 0x9F);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+
+ res2 = sc2.popUTF8();
+ ASSERT_TRUE(res2.has_value());
+ ASSERT_EQ(static_cast<int>(res2.value()), 0xA4);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+
+ res2 = sc2.popUTF8();
+ ASSERT_TRUE(res2.has_value());
+ ASSERT_EQ(static_cast<int>(res2.value()), 0xA1);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+
+ res2 = sc2.popUTF8();
+ ASSERT_FALSE(res2.has_value());
+ // no space to write error NOT invalid character error (EILSEQ)
+ ASSERT_EQ(static_cast<int>(res2.error()), -1);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+}
+
TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
/*
We do NOT test partially popping a character and expecting the next
>From 834d65c239dd2a8bba57ee3a97c9ed8337efa216 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Thu, 17 Jul 2025 16:57:59 +0000
Subject: [PATCH 2/2] formatting
---
libc/src/__support/wchar/string_converter.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index b4e6fde954235..869ebdfc8b390 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -63,7 +63,7 @@ template <typename T> class StringConverter {
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
return Error(src_elements_read.error());
-
+
if (cr.sizeAsUTF32() > num_to_write) {
cr.clear();
return Error(-1);
@@ -84,7 +84,7 @@ template <typename T> class StringConverter {
ErrorOr<char8_t> popUTF8() {
if (num_to_write == 0)
return Error(-1);
-
+
if (cr.isEmpty() || src_idx == 0) {
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
More information about the libc-commits
mailing list