[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)

Michael Jones via libc-commits libc-commits at lists.llvm.org
Fri Jun 13 10:08:08 PDT 2025


================
@@ -28,39 +28,30 @@ int CharacterConverter::push(char8_t utf8_byte) {
   // Checking the first byte if first push
   if (state->bytes_processed == 0 && state->total_bytes == 0) {
     state->partial = static_cast<char32_t>(0);
-    int numOnes = cpp::countl_one(utf8_byte);
-    switch (numOnes) {
+    uint8_t numOnes = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
     // 1 byte total
-    case 0:
+    if (numOnes == 0) {
       state->total_bytes = 1;
-      break;
-    // 2 bytes total
-    case 2:
-      state->total_bytes = 2;
-      utf8_byte &= 0x1F;
-      break;
-    // 3 bytes total
-    case 3:
-      state->total_bytes = 3;
-      utf8_byte &= 0x0F;
-      break;
-    // 4 bytes total
-    case 4:
-      state->total_bytes = 4;
-      utf8_byte &= 0x07;
-      break;
+    }
+    // 2 through 4 bytes total
+    else if (numOnes >= 2 && numOnes <= 4) {
+      state->total_bytes = numOnes;
+      utf8_byte &= (0x7F >> numOnes);
+    }
     // Invalid first byte
-    default:
+    else {
       return -1;
     }
     state->partial = static_cast<char32_t>(utf8_byte);
     state->bytes_processed++;
     return 0;
   }
   // Any subsequent push
+  // Adding 6 more bits so need to left shift
+  const int shift_amount = 6;
----------------
michaelrj-google wrote:

6 is the amount the value is shifted by, but why 6 specifically? Maybe call this `BITS_PER_UTF8` or something similar, so the name explains where the 6 comes from (the "6 more bits" added on each subsequent push).

https://github.com/llvm/llvm-project/pull/143973


More information about the libc-commits mailing list