[libc-commits] [libc] [libc] Added isValidState to CharacterConverter class to ensure a provided mbstate is valid (PR #145564)
Uzair Nawaz via libc-commits
libc-commits at lists.llvm.org
Wed Jun 25 10:49:38 PDT 2025
https://github.com/uzairnawaz updated https://github.com/llvm/llvm-project/pull/145564
>From f370bf713a9ca7275d75695045efd327d0055ad8 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Tue, 24 Jun 2025 18:18:11 +0000
Subject: [PATCH 1/4] added isValidState
---
.../__support/wchar/character_converter.cpp | 12 +++++-
.../src/__support/wchar/character_converter.h | 1 +
.../src/__support/wchar/utf32_to_8_test.cpp | 42 +++++++++++++++++++
3 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 1f81de4248ff0..8c978b659f3ab 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -25,6 +25,8 @@ constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
// Information not metadata (# of bits excluding the byte headers)
constexpr uint32_t MASK_ENCODED_BITS =
mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+// Maximum value for utf-32 for a utf-8 sequence of a given length
+constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
@@ -40,6 +42,15 @@ bool CharacterConverter::isFull() {
bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
+bool CharacterConverter::isValidState() {
+ const char32_t max_utf32_value =
+ state->total_bytes == 0 ? 0
+ : MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1];
+ return state->bytes_stored <= state->total_bytes &&
+ state->bytes_stored >= 0 && state->total_bytes <= 4 &&
+ state->partial <= max_utf32_value;
+}
+
int CharacterConverter::push(char8_t utf8_byte) {
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
// Checking the first byte if first push
@@ -90,7 +101,6 @@ int CharacterConverter::push(char32_t utf32) {
state->partial = utf32;
// determine number of utf-8 bytes needed to represent this utf32 value
- constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
constexpr int NUM_RANGES = 4;
for (uint8_t i = 0; i < NUM_RANGES; i++) {
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index be0e6129df236..d9a63fdc0522c 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -28,6 +28,7 @@ class CharacterConverter {
void clear();
bool isFull();
bool isEmpty();
+ bool isValidState();
int push(char8_t utf8_byte);
int push(char32_t utf32);
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index a6a7bc4aa6f4c..1ad523e148845 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -186,3 +186,45 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
int err = cr.push(utf32);
ASSERT_EQ(err, -1);
}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, InvalidState) {
+ LIBC_NAMESPACE::internal::mbstate s1;
+ LIBC_NAMESPACE::internal::CharacterConverter c1(&s1);
+ ASSERT_TRUE(c1.isValidState());
+
+ LIBC_NAMESPACE::internal::mbstate s2{0, 2, 0};
+ LIBC_NAMESPACE::internal::CharacterConverter c2(&s2);
+ ASSERT_FALSE(c2.isValidState());
+
+ LIBC_NAMESPACE::internal::mbstate s3{0x7f, 1, 1};
+ LIBC_NAMESPACE::internal::CharacterConverter c3(&s3);
+ ASSERT_TRUE(c3.isValidState());
+ LIBC_NAMESPACE::internal::mbstate s4{0x80, 1, 1};
+ LIBC_NAMESPACE::internal::CharacterConverter c4(&s4);
+ ASSERT_FALSE(c4.isValidState());
+
+ LIBC_NAMESPACE::internal::mbstate s5{0x7ff, 1, 2};
+ LIBC_NAMESPACE::internal::CharacterConverter c5(&s5);
+ ASSERT_TRUE(c5.isValidState());
+ LIBC_NAMESPACE::internal::mbstate s6{0x800, 1, 2};
+ LIBC_NAMESPACE::internal::CharacterConverter c6(&s6);
+ ASSERT_FALSE(c6.isValidState());
+
+ LIBC_NAMESPACE::internal::mbstate s7{0xffff, 1, 3};
+ LIBC_NAMESPACE::internal::CharacterConverter c7(&s7);
+ ASSERT_TRUE(c7.isValidState());
+ LIBC_NAMESPACE::internal::mbstate s8{0x10000, 1, 3};
+ LIBC_NAMESPACE::internal::CharacterConverter c8(&s8);
+ ASSERT_FALSE(c8.isValidState());
+
+ LIBC_NAMESPACE::internal::mbstate s9{0x10ffff, 1, 4};
+ LIBC_NAMESPACE::internal::CharacterConverter c9(&s9);
+ ASSERT_TRUE(c9.isValidState());
+ LIBC_NAMESPACE::internal::mbstate s10{0x110000, 1, 2};
+ LIBC_NAMESPACE::internal::CharacterConverter c10(&s10);
+ ASSERT_FALSE(c10.isValidState());
+
+ LIBC_NAMESPACE::internal::mbstate s11{0, 0, 5};
+ LIBC_NAMESPACE::internal::CharacterConverter c11(&s11);
+ ASSERT_FALSE(c11.isValidState());
+}
>From f66522885fde5e8361f70a51a596caaa66a59830 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Tue, 24 Jun 2025 22:10:44 +0000
Subject: [PATCH 2/4] fixed isValidState edge case
---
libc/src/__support/wchar/character_converter.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 8c978b659f3ab..01bed6e030287 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -27,6 +27,7 @@ constexpr uint32_t MASK_ENCODED_BITS =
mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
// Maximum value for utf-32 for a utf-8 sequence of a given length
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+constexpr int MAX_UTF8_LENGTH = 4;
CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
@@ -43,12 +44,14 @@ bool CharacterConverter::isFull() {
bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
bool CharacterConverter::isValidState() {
+ if (state->total_bytes > 4)
+ return false;
+
const char32_t max_utf32_value =
state->total_bytes == 0 ? 0
: MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1];
return state->bytes_stored <= state->total_bytes &&
- state->bytes_stored >= 0 && state->total_bytes <= 4 &&
- state->partial <= max_utf32_value;
+ state->bytes_stored >= 0 && state->partial <= max_utf32_value;
}
int CharacterConverter::push(char8_t utf8_byte) {
@@ -101,8 +104,7 @@ int CharacterConverter::push(char32_t utf32) {
state->partial = utf32;
// determine number of utf-8 bytes needed to represent this utf32 value
- constexpr int NUM_RANGES = 4;
- for (uint8_t i = 0; i < NUM_RANGES; i++) {
+ for (uint8_t i = 0; i < MAX_UTF8_LENGTH; i++) {
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
state->total_bytes = i + 1;
state->bytes_stored = i + 1;
>From 1b5d23b293300e675b79c6b9d58b8df618aa86dc Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Tue, 24 Jun 2025 22:34:21 +0000
Subject: [PATCH 3/4] replaced magic number
---
libc/src/__support/wchar/character_converter.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 01bed6e030287..dbf0024653ccf 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -44,7 +44,7 @@ bool CharacterConverter::isFull() {
bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
bool CharacterConverter::isValidState() {
- if (state->total_bytes > 4)
+ if (state->total_bytes > MAX_UTF8_LENGTH)
return false;
const char32_t max_utf32_value =
>From 6c7705c893199332eb5801f22a93131e9722f0f2 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 25 Jun 2025 17:49:08 +0000
Subject: [PATCH 4/4] redundant check
---
libc/src/__support/wchar/character_converter.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index dbf0024653ccf..c54a1b751f402 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -51,7 +51,7 @@ bool CharacterConverter::isValidState() {
state->total_bytes == 0 ? 0
: MAX_VALUE_PER_UTF8_LEN[state->total_bytes - 1];
return state->bytes_stored <= state->total_bytes &&
- state->bytes_stored >= 0 && state->partial <= max_utf32_value;
+ state->partial <= max_utf32_value;
}
int CharacterConverter::push(char8_t utf8_byte) {
More information about the libc-commits
mailing list