[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)
Michael Jones via libc-commits
libc-commits at lists.llvm.org
Fri Jun 13 10:08:08 PDT 2025
================
@@ -28,39 +28,30 @@ int CharacterConverter::push(char8_t utf8_byte) {
// Checking the first byte if first push
if (state->bytes_processed == 0 && state->total_bytes == 0) {
state->partial = static_cast<char32_t>(0);
- int numOnes = cpp::countl_one(utf8_byte);
- switch (numOnes) {
+ uint8_t numOnes = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
// 1 byte total
- case 0:
+ if (numOnes == 0) {
state->total_bytes = 1;
- break;
- // 2 bytes total
- case 2:
- state->total_bytes = 2;
- utf8_byte &= 0x1F;
- break;
- // 3 bytes total
- case 3:
- state->total_bytes = 3;
- utf8_byte &= 0x0F;
- break;
- // 4 bytes total
- case 4:
- state->total_bytes = 4;
- utf8_byte &= 0x07;
- break;
+ }
+ // 2 through 4 bytes total
+ else if (numOnes >= 2 && numOnes <= 4) {
+ state->total_bytes = numOnes;
+ utf8_byte &= (0x7F >> numOnes);
+ }
// Invalid first byte
- default:
+ else {
return -1;
}
state->partial = static_cast<char32_t>(utf8_byte);
state->bytes_processed++;
return 0;
}
// Any subsequent push
+ // Adding 6 more bits so need to left shift
+ const int shift_amount = 6;
----------------
michaelrj-google wrote:
6 is the amount the number is shifted by, but why is it 6 specifically? Maybe call this `BITS_PER_UTF8` or something similar.
https://github.com/llvm/llvm-project/pull/143973
More information about the libc-commits mailing list