[libc-commits] [libc] [libc] Implemented CharacterConverter push/pop for utf32->utf8 conversions (PR #143971)
via libc-commits
libc-commits at lists.llvm.org
Mon Jun 16 11:01:10 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libc
Author: Uzair Nawaz (uzairnawaz)
<details>
<summary>Changes</summary>
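Implemented the CharacterConverter methods for UTF-32 -> UTF-8 conversion: `push(char32_t)`, `pop_utf8()`, and a new `clear()` that resets the conversion state.
Replaced the `utf_ret` return type with `ErrorOr` and removed `utf_ret.h`.
Added unit tests covering 1-, 2-, 3-, and 4-byte UTF-8 encodings and the rejection of a push made mid-conversion.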
---
Full diff: https://github.com/llvm/llvm-project/pull/143971.diff
8 Files Affected:
- (modified) libc/src/__support/wchar/CMakeLists.txt (+2-7)
- (modified) libc/src/__support/wchar/character_converter.cpp (+65-5)
- (modified) libc/src/__support/wchar/character_converter.h (+5-3)
- (modified) libc/src/__support/wchar/mbstate.h (+9)
- (removed) libc/src/__support/wchar/utf_ret.h (-24)
- (modified) libc/test/src/__support/CMakeLists.txt (+6)
- (added) libc/test/src/__support/wchar/CMakeLists.txt (+11)
- (added) libc/test/src/__support/wchar/utf32_to_8_test.cpp (+180)
``````````diff
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 5cca58400ff45..6715e354e23e5 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -15,12 +15,7 @@ add_object_library(
DEPENDS
libc.hdr.types.char8_t
libc.hdr.types.char32_t
+ libc.src.__support.error_or
+ libc.src.__support.math_extras
.mbstate
- .utf_ret
-)
-
-add_header_library(
- utf_ret
- HDRS
- utf_ret.h
)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f09c7815a6cc4..10c01b54a663a 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,8 +8,10 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/math_extras.h"
#include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
#include "character_converter.h"
@@ -18,17 +20,75 @@ namespace internal {
CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
+void CharacterConverter::clear() {
+ state->partial = 0;
+ state->bytes_processed = 0;
+ state->total_bytes = 0;
+}
+
bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char32_t utf32) {
+ // we can't be partially through a conversion when pushing a utf32 value
+ if (!isComplete())
+ return -1;
+
+ state->partial = utf32;
+ state->bytes_processed = 0;
+
+ // determine number of utf-8 bytes needed to represent this utf32 value
+ constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+ constexpr int NUM_RANGES = 4;
+ for (uint8_t i = 0; i < NUM_RANGES; i++) {
+ if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
+ state->total_bytes = i + 1;
+ return 0;
+ }
+ }
+
+ // `utf32` contains a value that is too large to actually represent a valid
+ // unicode character
+ clear();
+ return -1;
+}
+
+ErrorOr<char8_t> CharacterConverter::pop_utf8() {
+ if (isComplete())
+ return Error(-1);
+
+ constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
+ constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
-int CharacterConverter::push(char32_t utf32) {}
+ // the number of bits per utf-8 byte that actually encode character
+ // information not metadata (# of bits excluding the byte headers)
+ constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+ constexpr int MASK_ENCODED_BITS =
+ mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+ char32_t output;
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+ // Shift to get the next 6 bits from the utf32 encoding
+ const char32_t shift_amount =
+ (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
+ if (state->bytes_processed == 0) {
+ /*
+ Choose the correct set of most significant bits to encode the length
+ of the utf8 sequence. The remaining bits contain the most significant
+ bits of the unicode value of the character.
+ */
+ output = FIRST_BYTE_HEADERS[state->total_bytes - 1] |
+ (state->partial >> shift_amount);
+ } else {
+ // Get the next 6 bits and format it like so: 10xxxxxx
+ output = CONTINUING_BYTE_HEADER |
+ ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
+ }
+
+ state->bytes_processed++;
+ return (char8_t)output;
+}
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d0602d2defe22..c4ba7cf6b689f 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,8 +11,9 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -24,13 +25,14 @@ class CharacterConverter {
public:
CharacterConverter(mbstate *mbstate);
+ void clear();
bool isComplete();
int push(char8_t utf8_byte);
int push(char32_t utf32);
- utf_ret<char8_t> pop_utf8();
- utf_ret<char32_t> pop_utf32();
+ ErrorOr<char8_t> pop_utf8();
+ ErrorOr<char32_t> pop_utf32();
};
} // namespace internal
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index d33ee354a5443..fb08fb4eaa188 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
namespace internal {
struct mbstate {
+ // store a partial codepoint (in UTF-32)
char32_t partial;
+
+ /*
+ Progress towards a conversion
+ For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
+ For utf32 -> utf8, increases with each CharacterConverter::pop_utf8()
+ */
uint8_t bytes_processed;
+
+ // Total number of bytes that will be needed to represent this character
uint8_t total_bytes;
};
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
deleted file mode 100644
index fa99b76159bd8..0000000000000
--- a/libc/src/__support/wchar/utf_ret.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace internal {
-template <typename T> struct utf_ret {
- T out;
- int error;
-};
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..76218a16e0cf7 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,9 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
+
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+ add_subdirectory(wchar)
+endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..5dff6e9115f7d
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+ utf32_to_8_test
+ SUITE
+ libc-support-tests
+ SRCS
+ utf32_to_8_test.cpp
+ DEPENDS
+ libc.src.__support.wchar.character_converter
+)
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
new file mode 100644
index 0000000000000..178a503ea8f69
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -0,0 +1,180 @@
+//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+ cr.clear();
+
+ // utf8 1-byte encodings are identical to their utf32 representations
+ char32_t utf32_A = 0x41; // 'A'
+ cr.push(utf32_A);
+ auto popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<char>(popped.value()), 'A');
+ ASSERT_TRUE(cr.isComplete());
+
+ char32_t utf32_B = 0x42; // 'B'
+ cr.push(utf32_B);
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<char>(popped.value()), 'B');
+ ASSERT_TRUE(cr.isComplete());
+
+ // should error if we try to pop another utf8 byte out
+ popped = cr.pop_utf8();
+ ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+ cr.clear();
+
+ // testing utf32: 0xff -> utf8: 0xc3 0xbf
+ char32_t utf32 = 0xff;
+ cr.push(utf32);
+ auto popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
+ ASSERT_TRUE(cr.isComplete());
+
+ // testing utf32: 0x58e -> utf8: 0xd6 0x8e
+ utf32 = 0x58e;
+ cr.push(utf32);
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
+ ASSERT_TRUE(cr.isComplete());
+
+ // should error if we try to pop another utf8 byte out
+ popped = cr.pop_utf8();
+ ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+ cr.clear();
+
+ // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
+ char32_t utf32 = 0xac15;
+ cr.push(utf32);
+ auto popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
+ ASSERT_TRUE(cr.isComplete());
+
+ // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
+ utf32 = 0x267b;
+ cr.push(utf32);
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
+ ASSERT_TRUE(cr.isComplete());
+
+ // should error if we try to pop another utf8 byte out
+ popped = cr.pop_utf8();
+ ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+ cr.clear();
+
+ // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+ char32_t utf32 = 0x1f921;
+ cr.push(utf32);
+ auto popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+ ASSERT_TRUE(cr.isComplete());
+
+ // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+ utf32 = 0x12121;
+ cr.push(utf32);
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
+ ASSERT_TRUE(!cr.isComplete());
+ popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+ ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+ ASSERT_TRUE(cr.isComplete());
+
+ // should error if we try to pop another utf8 byte out
+ popped = cr.pop_utf8();
+ ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+ cr.clear();
+
+ // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+ char32_t utf32 = 0x12121;
+ ASSERT_EQ(cr.push(utf32), 0);
+ auto popped = cr.pop_utf8();
+ ASSERT_TRUE(popped.has_value());
+
+ // can't push a utf32 without finishing popping the utf8 bytes out
+ int err = cr.push(utf32);
+ ASSERT_EQ(err, -1);
+}
\ No newline at end of file
``````````
</details>
https://github.com/llvm/llvm-project/pull/143971
More information about the libc-commits mailing list