[libc-commits] [libc] [libc] Wchar Stringconverter (PR #146388)
Uzair Nawaz via libc-commits
libc-commits at lists.llvm.org
Wed Jul 2 10:21:29 PDT 2025
https://github.com/uzairnawaz updated https://github.com/llvm/llvm-project/pull/146388
>From c1e1650a822d6b8867cf05304937aa4a2c046fd3 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 30 Jun 2025 16:24:13 +0000
Subject: [PATCH 1/4] added tests
---
libc/src/__support/wchar/CMakeLists.txt | 12 ++
libc/src/__support/wchar/string_converter.h | 85 +++++++++
libc/test/src/__support/wchar/CMakeLists.txt | 10 ++
.../__support/wchar/string_converter_test.cpp | 167 ++++++++++++++++++
4 files changed, 274 insertions(+)
create mode 100644 libc/src/__support/wchar/string_converter.h
create mode 100644 libc/test/src/__support/wchar/string_converter_test.cpp
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 86a47319f278a..a62fd8fbb64cd 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -6,6 +6,18 @@ add_header_library(
libc.hdr.types.char32_t
)
+add_header_library(
+ string_converter
+ HDRS
+ string_converter.h
+ DEPENDS
+ libc.hdr.types.char8_t
+ libc.hdr.types.char32_t
+ libc.src.__support.error_or
+ .mbstate
+ .character_converter
+)
+
add_object_library(
character_converter
HDRS
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
new file mode 100644
index 0000000000000..cfbf116fd112e
--- /dev/null
+++ b/libc/src/__support/wchar/string_converter.h
@@ -0,0 +1,85 @@
+//===-- Definition of a class for string conversion ------------*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+template <typename T> class StringConverter {
+private:
+ CharacterConverter cr;
+ const T *src;
+ size_t src_len;
+ size_t src_idx;
+
+ int pushFullCharacter() {
+ if (!cr.isEmpty())
+ return 0;
+
+ int original_idx = src_idx;
+ while (!cr.isFull() && src_idx < src_len) {
+ int err = cr.push(src[src_idx++]);
+ if (err != 0) {
+ // point to the beginning of the invalid sequence
+ src_idx = original_idx;
+ return err;
+ }
+ }
+
+ if (src_idx == src_len && !cr.isFull()) {
+ // src points to the beginning of the character
+ src_idx = original_idx;
+ return -1;
+ }
+
+ return 0;
+ }
+
+public:
+ StringConverter(const T *s, mbstate *ps)
+ : cr(ps), src(s), src_len(SIZE_MAX), src_idx(0) {}
+ StringConverter(const T *s, size_t len, mbstate *ps)
+ : cr(ps), src(s), src_len(len), src_idx(0) {}
+
+ ErrorOr<char32_t> popUTF32() {
+ if (pushFullCharacter() != 0)
+ return Error(0);
+
+ auto out = cr.pop_utf32();
+ if (out.has_value() && out.value() == L'\0')
+ src_len = src_idx;
+
+ return out;
+ }
+
+ ErrorOr<char8_t> popUTF8() {
+ if (pushFullCharacter() != 0)
+ return Error(0);
+
+ auto out = cr.pop_utf8();
+ if (out.has_value() && out.value() == '\0')
+ src_len = src_idx;
+
+ return out;
+ }
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024b..e8790df9a73cf 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,13 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.character_converter
)
+
+add_libc_test(
+ string_converter_test.cpp
+ SUITE
+ libc-support-tests
+ SRCS
+ string_converter_test.cpp
+ DEPENDS
+ libc.src.__support.wchar.string_converter
+)
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
new file mode 100644
index 0000000000000..2532b54f9ab89
--- /dev/null
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -0,0 +1,167 @@
+//===-- Unittests for StringConverter class -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/errno_macros.h"
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/string_converter.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStringConverterTest, UTF8To32) {
+ // first 4 bytes are clown emoji, then next 3 are sigma symbol
+ const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+ reinterpret_cast<const char8_t *>(src), &state);
+
+ auto res = sc.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+
+ res = sc.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x2211);
+
+ res = sc.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0);
+
+ res = sc.popUTF32();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(res.error(), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8) {
+ const wchar_t *src = L"\x1f921\x2211";
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+ reinterpret_cast<const char32_t *>(src), &state);
+
+ auto res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x88);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x91);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0);
+
+ res = sc.popUTF8();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(res.error(), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
+ const wchar_t *src = L"\x1f921\x2211";
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+ reinterpret_cast<const char32_t *>(src), 1, &state);
+
+ auto res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+
+ res = sc.popUTF8();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(res.error(), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
+ // first 4 bytes are clown emoji, then next 3 are sigma symbol
+ const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+ reinterpret_cast<const char8_t *>(src), 5, &state);
+
+ auto res = sc.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+
+ res = sc.popUTF32();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.error()), 0);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
+ const wchar_t *src = L"\x1f921\xffffff";
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+ reinterpret_cast<const char32_t *>(src), &state);
+
+ auto res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+
+ res = sc.popUTF8();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
+ const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+ reinterpret_cast<const char8_t *>(src), &state);
+
+ auto res = sc.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+
+ res = sc.popUTF32();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+}
>From f18221f23593f2e6a17e6a22a42c65f844e2f69f Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 30 Jun 2025 16:45:57 +0000
Subject: [PATCH 2/4] fixed test
---
libc/src/__support/wchar/string_converter.h | 10 ++++++----
.../test/src/__support/wchar/string_converter_test.cpp | 10 +++++-----
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index cfbf116fd112e..769691e5e2a87 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -57,8 +57,9 @@ template <typename T> class StringConverter {
: cr(ps), src(s), src_len(len), src_idx(0) {}
ErrorOr<char32_t> popUTF32() {
- if (pushFullCharacter() != 0)
- return Error(0);
+ int err = pushFullCharacter();
+ if (err != 0)
+ return Error(err);
auto out = cr.pop_utf32();
if (out.has_value() && out.value() == L'\0')
@@ -68,8 +69,9 @@ template <typename T> class StringConverter {
}
ErrorOr<char8_t> popUTF8() {
- if (pushFullCharacter() != 0)
- return Error(0);
+ int err = pushFullCharacter();
+ if (err != 0)
+ return Error(err);
auto out = cr.pop_utf8();
if (out.has_value() && out.value() == '\0')
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 2532b54f9ab89..54e2d86585a71 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -35,7 +35,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
res = sc.popUTF32();
ASSERT_FALSE(res.has_value());
- ASSERT_EQ(res.error(), 0);
+ ASSERT_EQ(res.error(), -1);
}
TEST(LlvmLibcStringConverterTest, UTF32To8) {
@@ -78,7 +78,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
res = sc.popUTF8();
ASSERT_FALSE(res.has_value());
- ASSERT_EQ(res.error(), 0);
+ ASSERT_EQ(res.error(), -1);
}
TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
@@ -105,7 +105,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
res = sc.popUTF8();
ASSERT_FALSE(res.has_value());
- ASSERT_EQ(res.error(), 0);
+ ASSERT_EQ(res.error(), -1);
}
TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
@@ -121,7 +121,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
res = sc.popUTF32();
ASSERT_FALSE(res.has_value());
- ASSERT_EQ(static_cast<int>(res.error()), 0);
+ ASSERT_EQ(static_cast<int>(res.error()), -1);
}
TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
@@ -152,7 +152,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
}
TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
- const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+ const char *src = "\xF0\x9F\xA4\xA1\xE2\x88";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
reinterpret_cast<const char8_t *>(src), &state);
>From 41e7e31d195535f1566bf0092030e86897ef5448 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Mon, 30 Jun 2025 20:13:26 +0000
Subject: [PATCH 3/4] fixed dependencies; added comments
---
libc/src/__support/wchar/CMakeLists.txt | 1 +
libc/src/__support/wchar/string_converter.h | 5 +++--
libc/test/src/__support/wchar/CMakeLists.txt | 4 ++++
.../src/__support/wchar/string_converter_test.cpp | 12 +++++++-----
4 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 836fecde8d6df..0828e4057c172 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -13,6 +13,7 @@ add_header_library(
DEPENDS
libc.hdr.types.char8_t
libc.hdr.types.char32_t
+ libc.hdr.types.size_t
libc.src.__support.error_or
.mbstate
.character_converter
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 769691e5e2a87..100828cd45fab 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -41,6 +41,7 @@ template <typename T> class StringConverter {
}
}
+ // if we aren't able to read a full character from the source string
if (src_idx == src_len && !cr.isFull()) {
// src points to the beginning of the character
src_idx = original_idx;
@@ -64,7 +65,7 @@ template <typename T> class StringConverter {
auto out = cr.pop_utf32();
if (out.has_value() && out.value() == L'\0')
src_len = src_idx;
-
+
return out;
}
@@ -76,7 +77,7 @@ template <typename T> class StringConverter {
auto out = cr.pop_utf8();
if (out.has_value() && out.value() == '\0')
src_len = src_idx;
-
+
return out;
}
};
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index e8790df9a73cf..6332d6469e71e 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -28,4 +28,8 @@ add_libc_test(
string_converter_test.cpp
DEPENDS
libc.src.__support.wchar.string_converter
+ libc.src.__support.wchar.mbstate
+ libc.src.__support.wchar.error_or
+ libc.hdr.errno_macros
+ libc.hdr.types.char32_t
)
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 54e2d86585a71..268cd6f17b590 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -15,7 +15,7 @@
#include "test/UnitTest/Test.h"
TEST(LlvmLibcStringConverterTest, UTF8To32) {
- // first 4 bytes are clown emoji, then next 3 are sigma symbol
+ // first 4 bytes are clown emoji (🤡), then next 3 are sigma symbol (∑)
const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
@@ -39,7 +39,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
}
TEST(LlvmLibcStringConverterTest, UTF32To8) {
- const wchar_t *src = L"\x1f921\x2211";
+ const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
reinterpret_cast<const char32_t *>(src), &state);
@@ -82,7 +82,7 @@ TEST(LlvmLibcStringConverterTest, UTF32To8) {
}
TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
- const wchar_t *src = L"\x1f921\x2211";
+ const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
reinterpret_cast<const char32_t *>(src), 1, &state);
@@ -114,7 +114,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
reinterpret_cast<const char8_t *>(src), 5, &state);
-
+
auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
@@ -125,7 +125,7 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
}
TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
- const wchar_t *src = L"\x1f921\xffffff";
+ const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
reinterpret_cast<const char32_t *>(src), &state);
@@ -152,6 +152,8 @@ TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
}
TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
+ // first 4 bytes are clown emoji (🤡)
+ // next 2 don't form a complete character
const char *src = "\xF0\x9F\xA4\xA1\xE2\x88";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
>From e6453a23ac6ce1173e39319f4775af52e86c3109 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz at google.com>
Date: Wed, 2 Jul 2025 17:20:55 +0000
Subject: [PATCH 4/4] StringConverter class complete!
---
.../__support/wchar/character_converter.cpp | 11 ++
.../src/__support/wchar/character_converter.h | 4 +
libc/src/__support/wchar/string_converter.h | 71 +++++---
libc/test/src/__support/wchar/CMakeLists.txt | 2 +-
.../__support/wchar/string_converter_test.cpp | 155 +++++++++++++++++-
5 files changed, 211 insertions(+), 32 deletions(-)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3cacfa5689e4d..8d7c4183c5998 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -14,6 +14,7 @@
#include "src/__support/error_or.h"
#include "src/__support/math_extras.h"
#include "src/__support/wchar/mbstate.h"
+#include <stddef.h>
#include "character_converter.h"
@@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
state->bytes_stored++;
return 0;
}
+
// Invalid byte -> reset the state
clear();
return EILSEQ;
@@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
return utf32;
}
+size_t CharacterConverter::sizeAsUTF32() {
+ return 1; // a single utf-32 value can fit an entire character
+}
+
+size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }
+
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
if (isEmpty())
return Error(-1);
@@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
}
state->bytes_stored--;
+ if (state->bytes_stored == 0)
+ clear();
+
return static_cast<char8_t>(output);
}
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d9a63fdc0522c..9e8dd71028002 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -14,6 +14,7 @@
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
+#include <stddef.h>
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -30,6 +31,9 @@ class CharacterConverter {
bool isEmpty();
bool isValidState();
+ size_t sizeAsUTF32();
+ size_t sizeAsUTF8();
+
int push(char8_t utf8_byte);
int push(char32_t utf32);
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
index 100828cd45fab..d7c735a8c7b17 100644
--- a/libc/src/__support/wchar/string_converter.h
+++ b/libc/src/__support/wchar/string_converter.h
@@ -26,25 +26,20 @@ template <typename T> class StringConverter {
const T *src;
size_t src_len;
size_t src_idx;
+ size_t bytes_pushed;
+ size_t num_to_write;
int pushFullCharacter() {
- if (!cr.isEmpty())
- return 0;
-
- int original_idx = src_idx;
- while (!cr.isFull() && src_idx < src_len) {
- int err = cr.push(src[src_idx++]);
- if (err != 0) {
- // point to the beginning of the invalid sequence
- src_idx = original_idx;
+ for (bytes_pushed = 0; !cr.isFull() && src_idx + bytes_pushed < src_len;
+ ++bytes_pushed) {
+ int err = cr.push(src[src_idx + bytes_pushed]);
+ if (err != 0)
return err;
- }
}
// if we aren't able to read a full character from the source string
- if (src_idx == src_len && !cr.isFull()) {
- // src points to the beginning of the character
- src_idx = original_idx;
+ if (src_idx + bytes_pushed == src_len && !cr.isFull()) {
+ src_idx += bytes_pushed;
return -1;
}
@@ -52,34 +47,64 @@ template <typename T> class StringConverter {
}
public:
- StringConverter(const T *s, mbstate *ps)
- : cr(ps), src(s), src_len(SIZE_MAX), src_idx(0) {}
- StringConverter(const T *s, size_t len, mbstate *ps)
- : cr(ps), src(s), src_len(len), src_idx(0) {}
+ StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate *ps)
+ : cr(ps), src(s), src_len(srclen), src_idx(0), bytes_pushed(0),
+ num_to_write(dstlen) {
+ pushFullCharacter();
+ }
+
+ StringConverter(const T *s, size_t dstlen, mbstate *ps)
+ : StringConverter(s, SIZE_MAX, dstlen, ps) {}
ErrorOr<char32_t> popUTF32() {
- int err = pushFullCharacter();
- if (err != 0)
- return Error(err);
+ if (cr.isEmpty()) {
+ int err = pushFullCharacter();
+ if (err != 0)
+ return Error(err);
+
+ if (cr.sizeAsUTF32() > num_to_write) {
+ cr.clear();
+ return Error(-1);
+ }
+ }
auto out = cr.pop_utf32();
+ if (cr.isEmpty())
+ src_idx += bytes_pushed;
+
if (out.has_value() && out.value() == L'\0')
src_len = src_idx;
+ num_to_write--;
+
return out;
}
ErrorOr<char8_t> popUTF8() {
- int err = pushFullCharacter();
- if (err != 0)
- return Error(err);
+ if (cr.isEmpty()) {
+ int err = pushFullCharacter();
+ if (err != 0)
+ return Error(err);
+
+ if (cr.sizeAsUTF8() > num_to_write) {
+ cr.clear();
+ return Error(-1);
+ }
+ }
auto out = cr.pop_utf8();
+ if (cr.isEmpty())
+ src_idx += bytes_pushed;
+
if (out.has_value() && out.value() == '\0')
src_len = src_idx;
+ num_to_write--;
+
return out;
}
+
+ size_t getSourceIndex() { return src_idx; }
};
} // namespace internal
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 6332d6469e71e..6982232d67544 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -29,7 +29,7 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.string_converter
libc.src.__support.wchar.mbstate
- libc.src.__support.wchar.error_or
+ libc.src.__support.error_or
libc.hdr.errno_macros
libc.hdr.types.char32_t
)
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
index 268cd6f17b590..cb908e2fec2a8 100644
--- a/libc/test/src/__support/wchar/string_converter_test.cpp
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -19,89 +19,106 @@ TEST(LlvmLibcStringConverterTest, UTF8To32) {
const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
- reinterpret_cast<const char8_t *>(src), &state);
+ reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x2211);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 7);
res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 8);
res = sc.popUTF32();
ASSERT_FALSE(res.has_value());
ASSERT_EQ(res.error(), -1);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 8);
}
TEST(LlvmLibcStringConverterTest, UTF32To8) {
const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
- reinterpret_cast<const char32_t *>(src), &state);
+ reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
auto res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x88);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x91);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
res = sc.popUTF8();
ASSERT_FALSE(res.has_value());
ASSERT_EQ(res.error(), -1);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
}
TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
- reinterpret_cast<const char32_t *>(src), 1, &state);
+ reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
auto res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
res = sc.popUTF8();
ASSERT_FALSE(res.has_value());
@@ -113,57 +130,179 @@ TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
- reinterpret_cast<const char8_t *>(src), 5, &state);
+ reinterpret_cast<const char8_t *>(src), 5, SIZE_MAX, &state);
auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
res = sc.popUTF32();
ASSERT_FALSE(res.has_value());
ASSERT_EQ(static_cast<int>(res.error()), -1);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
}
TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
- reinterpret_cast<const char32_t *>(src), &state);
+ reinterpret_cast<const char32_t *>(src), SIZE_MAX, &state);
auto res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
res = sc.popUTF8();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
res = sc.popUTF8();
ASSERT_FALSE(res.has_value());
ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
}
TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
// first 4 bytes are clown emoji (🤡)
- // next 2 don't form a complete character
- const char *src = "\xF0\x9F\xA4\xA1\xE2\x88";
+ // next 3 form an invalid character
+ const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30";
LIBC_NAMESPACE::internal::mbstate state;
LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
- reinterpret_cast<const char8_t *>(src), &state);
+ reinterpret_cast<const char8_t *>(src), SIZE_MAX, &state);
auto res = sc.popUTF32();
ASSERT_TRUE(res.has_value());
ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
res = sc.popUTF32();
ASSERT_FALSE(res.has_value());
ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+}
+
+TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
+ /*
+ We do NOT test partially popping a character and expecting the next
+ StringConverter to continue where we left off. This is not expected to work
+ and is considered invalid.
+ */
+ const wchar_t *src = L"\x1f921\xff"; // clown emoji, U+00FF (ÿ)
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
+ reinterpret_cast<const char32_t *>(src), 1, SIZE_MAX, &state);
+ // sc1 has srclen = 1, so it only converts the clown emoji (F0 9F A4 A1)
+ auto res = sc1.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+ ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+
+ res = sc1.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+ ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+
+ res = sc1.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+ ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 0);
+
+ res = sc1.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+ ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
+ // sc2 starts at the next source character and reuses the same mbstate
+ LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
+ reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), 1,
+ SIZE_MAX, &state);
+ // U+00FF encodes to the two UTF-8 bytes C3 BF
+ res = sc2.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 0);
+
+ res = sc2.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0xBF);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+}
+
+TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
+ const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char8_t> sc1(
+ reinterpret_cast<const char8_t *>(src), 2, SIZE_MAX, &state);
+ // sc1 sees only the first 2 of the 4 bytes, so no full character is ready
+ auto res = sc1.popUTF32();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.error()), -1);
+ ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 2);
+ // sc2 shares the mbstate and gets the last 2 bytes plus the null terminator
+ LIBC_NAMESPACE::internal::StringConverter<char8_t> sc2(
+ reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), 3,
+ SIZE_MAX, &state);
+
+ res = sc2.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 2);
+ // the null terminator is popped as a character with value 0
+ res = sc2.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(res.value()), 0);
+ ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 3);
+}
+
+TEST(LlvmLibcStringConverterTest, DstLimitUTF8To32) {
+ const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+ reinterpret_cast<const char8_t *>(src), SIZE_MAX, 1, &state);
+ // dstlen = 1: the destination has room for exactly one char32_t
+ auto res = sc.popUTF32();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+ res = sc.popUTF32(); // no space to pop this into
+ ASSERT_FALSE(res.has_value());
+}
+
+TEST(LlvmLibcStringConverterTest, DstLimitUTF32To8) {
+ const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+ reinterpret_cast<const char32_t *>(src), SIZE_MAX, 5, &state);
+ // dstlen = 5: fits the first emoji (4 UTF-8 bytes) but not the second
+ auto res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 0);
+
+ res = sc.popUTF8();
+ ASSERT_TRUE(res.has_value());
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+ // only 1 byte of destination space is left, so the second emoji is refused
+ res = sc.popUTF8();
+ ASSERT_FALSE(res.has_value());
+ ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+}
More information about the libc-commits
mailing list