[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)
Brooks Moses via libc-commits
libc-commits at lists.llvm.org
Fri Jun 13 15:51:23 PDT 2025
================
@@ -22,13 +25,65 @@ bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char8_t utf8_byte) {
+ // Checking the first byte if first push
+ if (state->bytes_processed == 0 && state->total_bytes == 0) {
+ state->partial = static_cast<char32_t>(0);
+ uint8_t numOnes = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
+ // 1 byte total
+ if (numOnes == 0) {
+ state->total_bytes = 1;
+ }
+ // 2 through 4 bytes total
+ else if (numOnes >= 2 && numOnes <= 4) {
+ /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
+ we will make the base mask with 7 ones and right shift it as necessary. */
+ constexpr size_t significant_bits = 7;
+ state->total_bytes = numOnes;
+ utf8_byte &=
+ (mask_trailing_ones<uint32_t, significant_bits>() >> numOnes);
+ }
+ // Invalid first byte
+ else {
+ return -1;
+ }
+ state->partial = static_cast<char32_t>(utf8_byte);
+ state->bytes_processed++;
+ return 0;
+ }
+ // Any subsequent push
+ // Adding 6 more bits so need to left shift
+ constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+ if (cpp::countl_one(utf8_byte) == 1 && !isComplete()) {
+ char32_t byte =
+ utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+ state->partial = state->partial << ENCODED_BITS_PER_UTF8;
+ state->partial |= byte;
+ state->bytes_processed++;
+ return 0;
+ }
+ // Invalid byte -> reset if we didn't get successful complete read
+ if (!isComplete()) {
+ state->partial = static_cast<char32_t>(0);
+ state->bytes_processed = 0;
+ state->total_bytes = 0;
+ }
+ return -1;
+}
-int CharacterConverter::push(char32_t utf32) {}
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+ // if pop is called too early
+ if (!isComplete())
+ return Error(-1);
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+ char32_t utf32 = state->partial;
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+ // reset if successful pop
+ state->bytes_processed = 0;
+ state->total_bytes = 0;
+ state->partial = static_cast<char32_t>(0);
----------------
brooksmoses wrote:
You repeat these three lines often enough that extracting them to a "reset" method seems like it would make things clearer.
https://github.com/llvm/llvm-project/pull/143973
More information about the libc-commits mailing list