[libc-commits] [libc] [libc] utf8 to 32 CharacterConverter (PR #143973)

Michael Jones via libc-commits libc-commits at lists.llvm.org
Thu Jun 12 16:35:20 PDT 2025


================
@@ -22,13 +23,69 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) {}
-
-int CharacterConverter::push(char32_t utf32) {}
-
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+int CharacterConverter::push(char8_t utf8_byte) {
+  // First push of a character (both counters still zero): classify the lead byte by its count of leading 1 bits
+  if (state->bytes_processed == 0 && state->total_bytes == 0) {
+    state->partial = static_cast<char32_t>(0);
+    // countl_one == 0: 1 byte total, the byte is stored unmasked
+    if (cpp::countl_one(utf8_byte) == 0) {
+      state->total_bytes = 1;
+    }
+    // countl_one == 2: 2 bytes total, keep the low 5 bits of the lead byte
+    else if (cpp::countl_one(utf8_byte) == 2) {
+      state->total_bytes = 2;
+      utf8_byte &= 0x1F;
+    }
+    // countl_one == 3: 3 bytes total, keep the low 4 bits of the lead byte
+    else if (cpp::countl_one(utf8_byte) == 3) {
+      state->total_bytes = 3;
+      utf8_byte &= 0x0F;
+    }
+    // countl_one == 4: 4 bytes total, keep the low 3 bits of the lead byte
+    else if (cpp::countl_one(utf8_byte) == 4) {
+      state->total_bytes = 4;
+      utf8_byte &= 0x07;
+    }
+    // Any other count is rejected; partial was zeroed above and both counters are still 0, so nothing more to reset
+    else {
+      return -1;
+    }
+    state->partial = static_cast<char32_t>(utf8_byte);
+    state->bytes_processed++;
+    return 0; // lead byte accepted
+  }
+  // Any subsequent push: accepted only when countl_one == 1 and the sequence is not yet complete
+  if (cpp::countl_one(utf8_byte) == 1 && !isComplete()) {
+    char32_t byte = utf8_byte & 0x3F; // low 6 bits of this byte
+    state->partial = state->partial << 6; // make room for the next 6 bits
+    state->partial |= byte;
+    state->bytes_processed++;
+    return 0; // byte accepted
+  }
+  // Rejected byte: clear the state only if a sequence was still in progress; a completed one is left as-is
+  if (!isComplete()) {
+    state->partial = static_cast<char32_t>(0);
+    state->bytes_processed = 0;
+    state->total_bytes = 0;
+  }
+  return -1; // byte rejected
+}
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+utf_ret<char32_t> CharacterConverter::pop_utf32() {
+  utf_ret<char32_t> utf32;
+  utf32.error = 0;
+  utf32.out = state->partial;
+  // if pop is called too early
+  if (!isComplete()) {
+    utf32.error = -1;
+    return utf32;
+  }
----------------
michaelrj-google wrote:

It doesn't seem particularly useful to return both the error and the partial value. Consider what the user sees: they get an error code and a value which they can't use. It might be better to return either the value or the error, but not both.

https://github.com/llvm/llvm-project/pull/143973


More information about the libc-commits mailing list