[libc-commits] [libc] [libc] Wchar Stringconverter (PR #146388)
via libc-commits
libc-commits at lists.llvm.org
Mon Jun 30 09:50:06 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libc
Author: Uzair Nawaz (uzairnawaz)
<details>
<summary>Changes</summary>
Implemented a string converter class to encapsulate the logic of converting between UTF-8 and UTF-32.
---
Full diff: https://github.com/llvm/llvm-project/pull/146388.diff
4 Files Affected:
- (modified) libc/src/__support/wchar/CMakeLists.txt (+12)
- (added) libc/src/__support/wchar/string_converter.h (+87)
- (modified) libc/test/src/__support/wchar/CMakeLists.txt (+10)
- (added) libc/test/src/__support/wchar/string_converter_test.cpp (+167)
``````````diff
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index d3fb58ed0c71c..836fecde8d6df 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -6,6 +6,18 @@ add_header_library(
libc.hdr.types.char32_t
)
+# Header-only StringConverter class. DEPENDS lists a target for each header
+# that string_converter.h includes directly.
+add_header_library(
+  string_converter
+  HDRS
+    string_converter.h
+  DEPENDS
+    libc.hdr.types.char8_t
+    libc.hdr.types.char32_t
+    libc.hdr.types.size_t
+    libc.src.__support.common
+    libc.src.__support.error_or
+    .mbstate
+    .character_converter
+)
+
add_object_library(
character_converter
HDRS
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
new file mode 100644
index 0000000000000..769691e5e2a87
--- /dev/null
+++ b/libc/src/__support/wchar/string_converter.h
@@ -0,0 +1,87 @@
+//===-- Definition of a class for string conversion ------------*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+/// Walks a source string of element type T and converts it one character at
+/// a time by pushing source elements into a CharacterConverter and popping
+/// the converted output as UTF-32 (popUTF32) or UTF-8 (popUTF8).
+///
+/// Both pop functions return an error of -1 once the source is exhausted
+/// (the length limit was reached, or a null character was already produced);
+/// any nonzero error reported by CharacterConverter::push is forwarded as is.
+template <typename T> class StringConverter {
+private:
+  CharacterConverter cr;
+  const T *src;
+  size_t src_len; // upper bound on the number of source elements to read
+  size_t src_idx; // index of the next source element to read
+
+  // Pushes source elements into `cr` until it reports a full character.
+  //
+  // Returns 0 on success, and also immediately when `cr` is not empty
+  // (presumably because it still holds output to pop -- confirm against
+  // CharacterConverter). Returns the nonzero error from `cr.push` if an
+  // element is rejected, or -1 if the source runs out before the character
+  // is complete. On failure `src_idx` is rewound to where this call started.
+  int pushFullCharacter() {
+    if (!cr.isEmpty())
+      return 0;
+
+    // src_idx is a size_t, so the saved position must be one too; storing it
+    // in an int would truncate large indices before they are restored below.
+    const size_t original_idx = src_idx;
+    while (!cr.isFull() && src_idx < src_len) {
+      int err = cr.push(src[src_idx++]);
+      if (err != 0) {
+        // point to the beginning of the invalid sequence
+        src_idx = original_idx;
+        return err;
+      }
+    }
+
+    if (src_idx == src_len && !cr.isFull()) {
+      // src points to the beginning of the character
+      src_idx = original_idx;
+      return -1;
+    }
+
+    return 0;
+  }
+
+public:
+  // Converts from `s` with no explicit length limit; reading stops once a
+  // null character has been produced (see popUTF32 / popUTF8).
+  StringConverter(const T *s, mbstate *ps)
+      : cr(ps), src(s), src_len(SIZE_MAX), src_idx(0) {}
+
+  // Converts from `s`, reading at most `len` source elements.
+  StringConverter(const T *s, size_t len, mbstate *ps)
+      : cr(ps), src(s), src_len(len), src_idx(0) {}
+
+  // Returns the next UTF-32 value of the converted string, or an error.
+  // After a null character is returned, src_len is clamped to the current
+  // position so that later calls read nothing more and return -1.
+  ErrorOr<char32_t> popUTF32() {
+    int err = pushFullCharacter();
+    if (err != 0)
+      return Error(err);
+
+    auto out = cr.pop_utf32();
+    if (out.has_value() && out.value() == L'\0')
+      src_len = src_idx;
+
+    return out;
+  }
+
+  // Returns the next UTF-8 byte of the converted string, or an error.
+  // After a null byte is returned, src_len is clamped to the current
+  // position so that later calls read nothing more and return -1.
+  ErrorOr<char8_t> popUTF8() {
+    int err = pushFullCharacter();
+    if (err != 0)
+      return Error(err);
+
+    auto out = cr.pop_utf8();
+    if (out.has_value() && out.value() == '\0')
+      src_len = src_idx;
+
+    return out;
+  }
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024b..e8790df9a73cf 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,13 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.character_converter
)
+
+# Unit tests for StringConverter. DEPENDS lists a target for each header that
+# string_converter_test.cpp includes directly.
+# NOTE(review): the target name carries a ".cpp" suffix; confirm whether
+# "string_converter_test" was intended.
+add_libc_test(
+  string_converter_test.cpp
+  SUITE
+    libc-support-tests
+  SRCS
+    string_converter_test.cpp
+  DEPENDS
+    libc.src.__support.wchar.string_converter
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.error_or
+    libc.hdr.errno_macros
+    libc.hdr.types.char32_t
+    libc.hdr.types.char8_t
+)
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
new file mode 100644
index 0000000000000..54e2d86585a71
--- /dev/null
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -0,0 +1,167 @@
+//===-- Unittests for StringConverter class -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/errno_macros.h"
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/string_converter.h"
+#include "test/UnitTest/Test.h"
+
+// Decodes a null-terminated UTF-8 string into UTF-32 with no length limit.
+TEST(LlvmLibcStringConverterTest, UTF8To32) {
+  using LIBC_NAMESPACE::internal::StringConverter;
+
+  // U+1F921 (clown face, 4 bytes) followed by U+2211 (summation sign, 3 bytes)
+  const char *utf8 = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  LIBC_NAMESPACE::internal::mbstate mbs;
+  StringConverter<char8_t> converter(reinterpret_cast<const char8_t *>(utf8),
+                                     &mbs);
+
+  // Both code points, then the terminating null, must come out in order.
+  const int expected[] = {0x1f921, 0x2211, 0};
+  for (int want : expected) {
+    auto decoded = converter.popUTF32();
+    ASSERT_TRUE(decoded.has_value());
+    ASSERT_EQ(static_cast<int>(decoded.value()), want);
+  }
+
+  // Popping again after the null terminator reports -1.
+  auto past_end = converter.popUTF32();
+  ASSERT_FALSE(past_end.has_value());
+  ASSERT_EQ(past_end.error(), -1);
+}
+
+// Encodes a null-terminated UTF-32 string into UTF-8 bytes with no length
+// limit.
+TEST(LlvmLibcStringConverterTest, UTF32To8) {
+  using LIBC_NAMESPACE::internal::StringConverter;
+
+  // U+1F921 followed by U+2211. The buffer is reinterpreted as char32_t,
+  // which assumes wchar_t is 32 bits wide on the test target.
+  const wchar_t *wide = L"\x1f921\x2211";
+  LIBC_NAMESPACE::internal::mbstate mbs;
+  StringConverter<char32_t> converter(
+      reinterpret_cast<const char32_t *>(wide), &mbs);
+
+  // UTF-8 bytes of both characters, then the terminating null.
+  const int expected_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1, 0xE2, 0x88, 0x91, 0};
+  for (int want : expected_bytes) {
+    auto byte = converter.popUTF8();
+    ASSERT_TRUE(byte.has_value());
+    ASSERT_EQ(static_cast<int>(byte.value()), want);
+  }
+
+  // Popping again after the null terminator reports -1.
+  auto past_end = converter.popUTF8();
+  ASSERT_FALSE(past_end.has_value());
+  ASSERT_EQ(past_end.error(), -1);
+}
+
+// Encodes UTF-32 to UTF-8 while limiting the converter to one source element.
+TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
+  using LIBC_NAMESPACE::internal::StringConverter;
+
+  // Two characters are present, but the length argument of 1 exposes only
+  // the first one (U+1F921) to the converter.
+  const wchar_t *wide = L"\x1f921\x2211";
+  LIBC_NAMESPACE::internal::mbstate mbs;
+  StringConverter<char32_t> converter(
+      reinterpret_cast<const char32_t *>(wide), 1, &mbs);
+
+  // The four UTF-8 bytes of the first character.
+  const int expected_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1};
+  for (int want : expected_bytes) {
+    auto byte = converter.popUTF8();
+    ASSERT_TRUE(byte.has_value());
+    ASSERT_EQ(static_cast<int>(byte.value()), want);
+  }
+
+  // The length limit has been reached, so the next pop reports -1.
+  auto past_limit = converter.popUTF8();
+  ASSERT_FALSE(past_limit.has_value());
+  ASSERT_EQ(past_limit.error(), -1);
+}
+
+// Decodes UTF-8 to UTF-32 with a length limit that ends inside a character.
+TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
+  using LIBC_NAMESPACE::internal::StringConverter;
+
+  // U+1F921 (4 bytes) followed by U+2211 (3 bytes); the limit of 5 bytes
+  // covers the first character plus only the lead byte of the second.
+  const char *utf8 = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  LIBC_NAMESPACE::internal::mbstate mbs;
+  StringConverter<char8_t> converter(reinterpret_cast<const char8_t *>(utf8),
+                                     5, &mbs);
+
+  auto first = converter.popUTF32();
+  ASSERT_TRUE(first.has_value());
+  ASSERT_EQ(static_cast<int>(first.value()), 0x1f921);
+
+  // The second character cannot be completed within the limit.
+  auto second = converter.popUTF32();
+  ASSERT_FALSE(second.has_value());
+  ASSERT_EQ(static_cast<int>(second.error()), -1);
+}
+
+// Encodes UTF-32 to UTF-8 where the second source value must be rejected.
+TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
+  using LIBC_NAMESPACE::internal::StringConverter;
+
+  // U+1F921 followed by 0xffffff, which the converter is expected to reject.
+  const wchar_t *wide = L"\x1f921\xffffff";
+  LIBC_NAMESPACE::internal::mbstate mbs;
+  StringConverter<char32_t> converter(
+      reinterpret_cast<const char32_t *>(wide), &mbs);
+
+  // The first character still encodes to its four UTF-8 bytes.
+  const int expected_bytes[] = {0xF0, 0x9F, 0xA4, 0xA1};
+  for (int want : expected_bytes) {
+    auto byte = converter.popUTF8();
+    ASSERT_TRUE(byte.has_value());
+    ASSERT_EQ(static_cast<int>(byte.value()), want);
+  }
+
+  // The invalid second value surfaces as EILSEQ.
+  auto rejected = converter.popUTF8();
+  ASSERT_FALSE(rejected.has_value());
+  ASSERT_EQ(static_cast<int>(rejected.error()), EILSEQ);
+}
+
+// Decodes UTF-8 to UTF-32 where the second character is cut short.
+TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
+  using LIBC_NAMESPACE::internal::StringConverter;
+
+  // U+1F921 (4 bytes) followed by only the first two bytes of a 3-byte
+  // sequence; the string's null terminator comes where the third byte
+  // should be.
+  const char *utf8 = "\xF0\x9F\xA4\xA1\xE2\x88";
+  LIBC_NAMESPACE::internal::mbstate mbs;
+  StringConverter<char8_t> converter(reinterpret_cast<const char8_t *>(utf8),
+                                     &mbs);
+
+  auto first = converter.popUTF32();
+  ASSERT_TRUE(first.has_value());
+  ASSERT_EQ(static_cast<int>(first.value()), 0x1f921);
+
+  // The truncated sequence surfaces as EILSEQ.
+  auto rejected = converter.popUTF32();
+  ASSERT_FALSE(rejected.has_value());
+  ASSERT_EQ(static_cast<int>(rejected.error()), EILSEQ);
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/146388
More information about the libc-commits
mailing list