[libc-commits] [libc] [libc][wctype] Implement internal UTF8 wctype classification functions (PR #174607)

Marcell Leleszi via libc-commits libc-commits at lists.llvm.org
Tue Jan 6 07:29:12 PST 2026


https://github.com/mleleszi created https://github.com/llvm/llvm-project/pull/174607

WIP

Closes [#174604](https://github.com/llvm/llvm-project/issues/174604)

This patch implements the internal wide character classification functions with Unicode support.
The ASCII paths that were not previously implemented are taken from [ctype_utils.h](https://github.com/llvm/llvm-project/blob/main/libc/src/__support/ctype_utils.h).



The tests mainly cover the dispatch logic between ASCII and UTF8; the UTF8 variants are already comprehensively tested in [wctype_classification_utils_test.cpp](https://github.com/mleleszi/llvm-project/blob/main/libc/test/src/__support/wctype/wctype_classification_utils_test.cpp).


>From afad4bdad5db4a5f15797490b8c5861fd48b96e4 Mon Sep 17 00:00:00 2001
From: Marcell Leleszi <mleleszi at google.com>
Date: Tue, 6 Jan 2026 14:58:45 +0000
Subject: [PATCH] Implement internal utf8 wctype classification functions

---
 libc/src/__support/CMakeLists.txt             |  12 +-
 libc/src/__support/wctype_utils.h             | 491 +++++++++++-----
 libc/test/src/__support/CMakeLists.txt        |  11 +
 libc/test/src/__support/wctype_utils_test.cpp | 536 ++++++++++++++++++
 4 files changed, 912 insertions(+), 138 deletions(-)
 create mode 100644 libc/test/src/__support/wctype_utils_test.cpp

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index df524c25cbd8a..fb5aca3a9b1d9 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -156,12 +156,22 @@ add_header_library(
     ctype_utils.h
 )
 
+list(APPEND wctype_utils_deps
+  libc.hdr.types.wchar_t
+)
+
+if ("${LIBC_CONF_WCTYPE_MODE}" STREQUAL "LIBC_WCTYPE_MODE_UTF8")
+  list(APPEND wctype_utils_deps
+      libc.src.__support.wctype.wctype_classification_utils
+  )
+endif()
+
 add_header_library(
   wctype_utils
   HDRS
     wctype_utils.h
   DEPENDS
-    libc.hdr.types.wchar_t
+    ${wctype_utils_deps}
 )
 
 add_header_library(
diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h
index 7f17224104ffb..56e355d99cc65 100644
--- a/libc/src/__support/wctype_utils.h
+++ b/libc/src/__support/wctype_utils.h
@@ -13,6 +13,13 @@
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"
 
+#define LIBC_WCTYPE_MODE_ASCII 0
+#define LIBC_WCTYPE_MODE_UTF8 1
+
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+#include "src/__support/wctype/wctype_classification_utils.h"
+#endif
+
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
@@ -38,8 +45,8 @@ namespace internal {
 // This assumes the character ranges are contiguous, which they aren't in
 // EBCDIC. Technically we could use some smaller ranges, but that's even harder
 // to read.
-
-LIBC_INLINE static constexpr bool islower(wchar_t wch) {
+namespace ascii {
+LIBC_INLINE constexpr bool islower(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -73,7 +80,7 @@ LIBC_INLINE static constexpr bool islower(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isupper(wchar_t wch) {
+LIBC_INLINE constexpr bool isupper(wchar_t wch) {
   switch (wch) {
   case L'A':
   case L'B':
@@ -107,7 +114,7 @@ LIBC_INLINE static constexpr bool isupper(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isdigit(wchar_t wch) {
+LIBC_INLINE constexpr bool isdigit(wchar_t wch) {
   switch (wch) {
   case L'0':
   case L'1':
@@ -125,125 +132,169 @@ LIBC_INLINE static constexpr bool isdigit(wchar_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wchar_t tolower(wchar_t wch) {
+LIBC_INLINE constexpr bool isalpha(wchar_t wch) {
   switch (wch) {
+  case L'a':
+  case L'b':
+  case L'c':
+  case L'd':
+  case L'e':
+  case L'f':
+  case L'g':
+  case L'h':
+  case L'i':
+  case L'j':
+  case L'k':
+  case L'l':
+  case L'm':
+  case L'n':
+  case L'o':
+  case L'p':
+  case L'q':
+  case L'r':
+  case L's':
+  case L't':
+  case L'u':
+  case L'v':
+  case L'w':
+  case L'x':
+  case L'y':
+  case L'z':
   case L'A':
-    return L'a';
   case L'B':
-    return L'b';
   case L'C':
-    return L'c';
   case L'D':
-    return L'd';
   case L'E':
-    return L'e';
   case L'F':
-    return L'f';
   case L'G':
-    return L'g';
   case L'H':
-    return L'h';
   case L'I':
-    return L'i';
   case L'J':
-    return L'j';
   case L'K':
-    return L'k';
   case L'L':
-    return L'l';
   case L'M':
-    return L'm';
   case L'N':
-    return L'n';
   case L'O':
-    return L'o';
   case L'P':
-    return L'p';
   case L'Q':
-    return L'q';
   case L'R':
-    return L'r';
   case L'S':
-    return L's';
   case L'T':
-    return L't';
   case L'U':
-    return L'u';
   case L'V':
-    return L'v';
   case L'W':
-    return L'w';
   case L'X':
-    return L'x';
   case L'Y':
-    return L'y';
   case L'Z':
-    return L'z';
+    return true;
   default:
-    return wch;
+    return false;
   }
 }
 
-LIBC_INLINE static constexpr wchar_t toupper(wchar_t wch) {
+LIBC_INLINE constexpr bool isalnum(wchar_t wch) {
   switch (wch) {
   case L'a':
-    return L'A';
   case L'b':
-    return L'B';
   case L'c':
-    return L'C';
   case L'd':
-    return L'D';
   case L'e':
-    return L'E';
   case L'f':
-    return L'F';
   case L'g':
-    return L'G';
   case L'h':
-    return L'H';
   case L'i':
-    return L'I';
   case L'j':
-    return L'J';
   case L'k':
-    return L'K';
   case L'l':
-    return L'L';
   case L'm':
-    return L'M';
   case L'n':
-    return L'N';
   case L'o':
-    return L'O';
   case L'p':
-    return L'P';
   case L'q':
-    return L'Q';
   case L'r':
-    return L'R';
   case L's':
-    return L'S';
   case L't':
-    return L'T';
   case L'u':
-    return L'U';
   case L'v':
-    return L'V';
   case L'w':
-    return L'W';
   case L'x':
-    return L'X';
   case L'y':
-    return L'Y';
   case L'z':
-    return L'Z';
+  case L'A':
+  case L'B':
+  case L'C':
+  case L'D':
+  case L'E':
+  case L'F':
+  case L'G':
+  case L'H':
+  case L'I':
+  case L'J':
+  case L'K':
+  case L'L':
+  case L'M':
+  case L'N':
+  case L'O':
+  case L'P':
+  case L'Q':
+  case L'R':
+  case L'S':
+  case L'T':
+  case L'U':
+  case L'V':
+  case L'W':
+  case L'X':
+  case L'Y':
+  case L'Z':
+  case L'0':
+  case L'1':
+  case L'2':
+  case L'3':
+  case L'4':
+  case L'5':
+  case L'6':
+  case L'7':
+  case L'8':
+  case L'9':
+    return true;
   default:
-    return wch;
+    return false;
+  }
+}
+
+LIBC_INLINE constexpr bool isspace(wchar_t wch) {
+  switch (wch) {
+  case L' ':
+  case L'\t':
+  case L'\n':
+  case L'\v':
+  case L'\f':
+  case L'\r':
+    return true;
+  default:
+    return false;
+  }
+}
+
+LIBC_INLINE constexpr bool isblank(wchar_t wch) {
+  switch (wch) {
+  case L' ':
+  case L'\t':
+    return true;
+  default:
+    return false;
   }
 }
 
-LIBC_INLINE static constexpr bool isalpha(wchar_t wch) {
+LIBC_INLINE constexpr bool isgraph(wchar_t wch) {
+  return 0x20 < wch && wch < 0x7f;
+}
+
+LIBC_INLINE constexpr bool isprint(wchar_t wch) {
+  return (static_cast<unsigned>(wch) - ' ') < 95;
+}
+
+LIBC_INLINE constexpr bool isxdigit(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -251,129 +302,309 @@ LIBC_INLINE static constexpr bool isalpha(wchar_t wch) {
   case L'd':
   case L'e':
   case L'f':
-  case L'g':
-  case L'h':
-  case L'i':
-  case L'j':
-  case L'k':
-  case L'l':
-  case L'm':
-  case L'n':
-  case L'o':
-  case L'p':
-  case L'q':
-  case L'r':
-  case L's':
-  case L't':
-  case L'u':
-  case L'v':
-  case L'w':
-  case L'x':
-  case L'y':
-  case L'z':
   case L'A':
   case L'B':
   case L'C':
   case L'D':
   case L'E':
   case L'F':
+  case L'0':
+  case L'1':
+  case L'2':
+  case L'3':
+  case L'4':
+  case L'5':
+  case L'6':
+  case L'7':
+  case L'8':
+  case L'9':
+    return true;
+  default:
+    return false;
+  }
+}
+
+LIBC_INLINE constexpr bool iscntrl(wchar_t wch) {
+  return (wch < 0x20 || wch == 0x7f);
+}
+
+LIBC_INLINE constexpr bool ispunct(wchar_t wch) {
+  return !isalnum(wch) && isgraph(wch);
+}
+
+LIBC_INLINE constexpr wchar_t tolower(wchar_t wch) {
+  switch (wch) {
+  case L'A':
+    return L'a';
+  case L'B':
+    return L'b';
+  case L'C':
+    return L'c';
+  case L'D':
+    return L'd';
+  case L'E':
+    return L'e';
+  case L'F':
+    return L'f';
   case L'G':
+    return L'g';
   case L'H':
+    return L'h';
   case L'I':
+    return L'i';
   case L'J':
+    return L'j';
   case L'K':
+    return L'k';
   case L'L':
+    return L'l';
   case L'M':
+    return L'm';
   case L'N':
+    return L'n';
   case L'O':
+    return L'o';
   case L'P':
+    return L'p';
   case L'Q':
+    return L'q';
   case L'R':
+    return L'r';
   case L'S':
+    return L's';
   case L'T':
+    return L't';
   case L'U':
+    return L'u';
   case L'V':
+    return L'v';
   case L'W':
+    return L'w';
   case L'X':
+    return L'x';
   case L'Y':
+    return L'y';
   case L'Z':
-    return true;
+    return L'z';
   default:
-    return false;
+    return wch;
   }
 }
 
-LIBC_INLINE static constexpr bool isalnum(wchar_t wch) {
+LIBC_INLINE constexpr wchar_t toupper(wchar_t wch) {
   switch (wch) {
   case L'a':
+    return L'A';
   case L'b':
+    return L'B';
   case L'c':
+    return L'C';
   case L'd':
+    return L'D';
   case L'e':
+    return L'E';
   case L'f':
+    return L'F';
   case L'g':
+    return L'G';
   case L'h':
+    return L'H';
   case L'i':
+    return L'I';
   case L'j':
+    return L'J';
   case L'k':
+    return L'K';
   case L'l':
+    return L'L';
   case L'm':
+    return L'M';
   case L'n':
+    return L'N';
   case L'o':
+    return L'O';
   case L'p':
+    return L'P';
   case L'q':
+    return L'Q';
   case L'r':
+    return L'R';
   case L's':
+    return L'S';
   case L't':
+    return L'T';
   case L'u':
+    return L'U';
   case L'v':
+    return L'V';
   case L'w':
+    return L'W';
   case L'x':
+    return L'X';
   case L'y':
+    return L'Y';
   case L'z':
-  case L'A':
-  case L'B':
-  case L'C':
-  case L'D':
-  case L'E':
-  case L'F':
-  case L'G':
-  case L'H':
-  case L'I':
-  case L'J':
-  case L'K':
-  case L'L':
-  case L'M':
-  case L'N':
-  case L'O':
-  case L'P':
-  case L'Q':
-  case L'R':
-  case L'S':
-  case L'T':
-  case L'U':
-  case L'V':
-  case L'W':
-  case L'X':
-  case L'Y':
-  case L'Z':
-  case L'0':
-  case L'1':
-  case L'2':
-  case L'3':
-  case L'4':
-  case L'5':
-  case L'6':
-  case L'7':
-  case L'8':
-  case L'9':
-    return true;
+    return L'Z';
   default:
-    return false;
+    return wch;
+  }
+}
+
+} // namespace ascii
+
+LIBC_INLINE constexpr bool islower(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::islower(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::islower(wch);
+  }
+  return lookup_properties(wch) & PropertyFlag::LOWER;
+#endif
+}
+
+LIBC_INLINE constexpr bool isupper(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::isupper(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::isupper(wch);
+  }
+  return lookup_properties(wch) & PropertyFlag::UPPER;
+#endif
+}
+
+LIBC_INLINE constexpr bool isdigit(wchar_t wch) {
+  // In C.UTF-8, only ASCII digits are considered digits
+  return ascii::isdigit(wch);
+}
+
+LIBC_INLINE constexpr bool isalpha(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::isalpha(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::isalpha(wch);
+  }
+  return lookup_properties(wch) & PropertyFlag::ALPHA;
+#endif
+}
+
+LIBC_INLINE constexpr bool isalnum(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::isalnum(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::isalnum(wch);
+  }
+  // Only need to check ALPHA; digit cases are covered by the ASCII path
+  return lookup_properties(wch) & PropertyFlag::ALPHA;
+#endif
+}
+
+LIBC_INLINE constexpr bool isspace(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::isspace(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::isspace(wch);
+  }
+  return lookup_properties(wch) & PropertyFlag::SPACE;
+#endif
+}
+
+LIBC_INLINE constexpr bool isblank(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::isblank(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::isblank(wch);
+  }
+  return lookup_properties(wch) & PropertyFlag::BLANK;
+#endif
+}
+
+LIBC_INLINE constexpr bool isgraph(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::isgraph(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::isgraph(wch);
+  }
+  // print && !space
+  return (lookup_properties(wch) &
+          (PropertyFlag::PRINT | PropertyFlag::SPACE)) == PropertyFlag::PRINT;
+#endif
+}
+
+LIBC_INLINE constexpr bool isprint(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::isprint(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::isprint(wch);
   }
+  return lookup_properties(wch) & PropertyFlag::PRINT;
+#endif
 }
 
-LIBC_INLINE static constexpr int b36_char_to_int(wchar_t wch) {
+LIBC_INLINE constexpr bool isxdigit(wchar_t wch) {
+  return ascii::isxdigit(wch);
+}
+
+LIBC_INLINE constexpr bool iscntrl(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::iscntrl(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::iscntrl(wch);
+  }
+  return lookup_properties(wch) & PropertyFlag::CNTRL;
+#endif
+}
+
+LIBC_INLINE constexpr bool ispunct(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::ispunct(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::ispunct(wch);
+  }
+  return lookup_properties(wch) & PropertyFlag::PUNCT;
+#endif
+}
+
+LIBC_INLINE constexpr wchar_t tolower(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::tolower(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::tolower(wch);
+  }
+  // TODO: Add UTF8 implementation. When adding UTF8 implementation, it would
+  // probably be worth moving conversion functions to a separate header, so it
+  // doesn't pull in classification lookup tables if only conversion is used,
+  // and vice versa.
+  return wch;
+#endif
+}
+
+LIBC_INLINE constexpr wchar_t toupper(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+  return ascii::toupper(wch);
+#else
+  if (static_cast<uint32_t>(wch) < 128) {
+    return ascii::toupper(wch);
+  }
+  // TODO: Add UTF8 implementation. When adding UTF8 implementation, it would
+  // probably be worth moving conversion functions to a separate header, so it
+  // doesn't pull in classification lookup tables if only conversion is used,
+  // and vice versa.
+  return wch;
+#endif
+}
+
+LIBC_INLINE constexpr int b36_char_to_int(wchar_t wch) {
   switch (wch) {
   case L'0':
     return 0;
@@ -561,20 +792,6 @@ LIBC_INLINE static constexpr wchar_t int_to_b36_wchar(int num) {
   }
 }
 
-LIBC_INLINE static constexpr bool isspace(wchar_t wch) {
-  switch (wch) {
-  case L' ':
-  case L'\t':
-  case L'\n':
-  case L'\v':
-  case L'\f':
-  case L'\r':
-    return true;
-  default:
-    return false;
-  }
-}
-
 // An overload which provides a way to compare input with specific character
 // values, when input can be of a regular or a wide character type.
 LIBC_INLINE static constexpr bool
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 98980ce66d9b2..b7ce7422c28a8 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -280,6 +280,17 @@ add_libc_test(
     libc.src.__support.CPP.bit
 )
 
+add_libc_test(
+  wctype_utils_test
+  SUITE
+    libc-support-tests
+  SRCS
+    wctype_utils_test.cpp
+  DEPENDS
+    libc.src.__support.wctype_utils
+    libc.src.__support.macros.properties.os
+)
+
 add_subdirectory(CPP)
 add_subdirectory(File)
 add_subdirectory(RPC)
diff --git a/libc/test/src/__support/wctype_utils_test.cpp b/libc/test/src/__support/wctype_utils_test.cpp
new file mode 100644
index 0000000000000..9bc7e818cdbc6
--- /dev/null
+++ b/libc/test/src/__support/wctype_utils_test.cpp
@@ -0,0 +1,536 @@
+#include "src/__support/macros/config.h"
+#include "src/__support/wctype/wctype_classification_utils.h"
+#include "test/UnitTest/Test.h"
+
+namespace {
+
+namespace ascii_mode {
+#undef LIBC_CONF_WCTYPE_MODE
+#define LIBC_CONF_WCTYPE_MODE LIBC_WCTYPE_MODE_ASCII
+
+#undef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
+#include "src/__support/wctype_utils.h"
+} // namespace ascii_mode
+
+namespace utf8_mode {
+#undef LIBC_CONF_WCTYPE_MODE
+#define LIBC_CONF_WCTYPE_MODE LIBC_WCTYPE_MODE_UTF8
+
+namespace LIBC_NAMESPACE_DECL {
+using ::LIBC_NAMESPACE::lookup_properties;
+using ::LIBC_NAMESPACE::PropertyFlag;
+} // namespace LIBC_NAMESPACE_DECL
+
+#undef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
+#include "src/__support/wctype_utils.h"
+} // namespace utf8_mode
+
+struct TestCase {
+  uint32_t wc;
+  const char *name;
+  bool expected;
+};
+
+// Helper function to mark the sections of the ASCII table that are
+// punctuation characters. These are listed below:
+//  Decimal    |         Symbol
+//  -----------------------------------------
+//  33 -  47   |  ! " # $ % & ' ( ) * + , - . /
+//  58 -  64   |  : ; < = > ? @
+//  91 -  96   |  [ \ ] ^ _ `
+// 123 - 126   |  { | } ~
+bool is_punctuation_character(int c) {
+  return ('!' <= c && c <= '/') || (':' <= c && c <= '@') ||
+         ('[' <= c && c <= '`') || ('{' <= c && c <= '~');
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsLowerAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::islower;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = c >= 'a' && c <= 'z';
+    EXPECT_EQ(islower(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(islower(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsLowerUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::islower;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = c >= 'a' && c <= 'z';
+    EXPECT_EQ(islower(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(islower(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsUpperAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isupper;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = c >= 'A' && c <= 'Z';
+    EXPECT_EQ(isupper(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isupper(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsUpperUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isupper;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = c >= 'A' && c <= 'Z';
+    EXPECT_EQ(isupper(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isupper(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsAlphaAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isalpha;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    EXPECT_EQ(isalpha(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isalpha(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsAlphaUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isalpha;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+    EXPECT_EQ(isalpha(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isalpha(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsDigitAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isdigit;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9');
+    EXPECT_EQ(isdigit(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isdigit(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsDigitUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isdigit;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9');
+    EXPECT_EQ(isdigit(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  // In C.UTF-8, isdigit only returns true for ASCII digits.
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isdigit(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsAlnumAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isalnum;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
+                    (c >= 'A' && c <= 'Z');
+    EXPECT_EQ(isalnum(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+      {0x0030, "DIGIT ZERO", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isalnum(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsAlnumUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isalnum;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
+                    (c >= 'A' && c <= 'Z');
+    EXPECT_EQ(isalnum(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", true},
+      {0x0030, "DIGIT ZERO", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isalnum(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsXDigitAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isxdigit;
+
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+                    (c >= 'A' && c <= 'F');
+    EXPECT_EQ(isxdigit(static_cast<wchar_t>(c)), expected);
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsXDigitUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isxdigit;
+
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+                    (c >= 'A' && c <= 'F');
+    EXPECT_EQ(isxdigit(static_cast<wchar_t>(c)), expected);
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsSpaceAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isspace;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c == ' ' || c == '\t' || c == '\n' || c == '\v' ||
+                     c == '\f' || c == '\r');
+    EXPECT_EQ(isspace(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+      {0x2028, "LINE SEPARATOR", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isspace(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsSpaceUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isspace;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c == ' ' || c == '\t' || c == '\n' || c == '\v' ||
+                     c == '\f' || c == '\r');
+    EXPECT_EQ(isspace(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x1680, "OGHAM SPACE MARK", true},
+      {0x2000, "EN QUAD", true},
+      {0x2028, "LINE SEPARATOR", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isspace(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsBlankAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isblank;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c == ' ' || c == '\t');
+    EXPECT_EQ(isblank(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isblank(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsBlankUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isblank;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c == ' ' || c == '\t');
+    EXPECT_EQ(isblank(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x1680, "OGHAM SPACE MARK", true},
+      {0x2000, "EN QUAD", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isblank(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsGraphAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isgraph;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c > 0x20 && c < 0x7f);
+    EXPECT_EQ(isgraph(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+      {0x2603, "SNOWMAN", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isgraph(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsGraphUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isgraph;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c > 0x20 && c < 0x7f);
+    EXPECT_EQ(isgraph(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", true},
+      {0x2000, "EN QUAD", false},
+      {0x2603, "SNOWMAN", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isgraph(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsPrintAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isprint;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= 0x20 && c < 0x7f);
+    EXPECT_EQ(isprint(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+      {0x2603, "SNOWMAN", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isprint(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsPrintUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isprint;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= 0x20 && c < 0x7f);
+    EXPECT_EQ(isprint(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A0, "NO-BREAK SPACE", true},
+      {0x2000, "EN QUAD", true},
+      {0x2603, "SNOWMAN", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isprint(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsPunctAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isalnum;
+  using ascii_mode::LIBC_NAMESPACE::internal::isgraph;
+  using ascii_mode::LIBC_NAMESPACE::internal::ispunct;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = is_punctuation_character(c);
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A1, "INVERTED EXCLAMATION MARK", false},
+      {0x2014, "EM DASH", false},
+      {0x20AC, "EURO SIGN", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsPunctUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isalnum;
+  using utf8_mode::LIBC_NAMESPACE::internal::isgraph;
+  using utf8_mode::LIBC_NAMESPACE::internal::ispunct;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = is_punctuation_character(c);
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A1, "INVERTED EXCLAMATION MARK", true},
+      {0x2014, "EM DASH", true},
+      {0x20AC, "EURO SIGN", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsCntrlAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::iscntrl;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c < 0x20 || c == 0x7f);
+    EXPECT_EQ(iscntrl(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x0080, "PADDING CHARACTER", false},
+      {0x009F, "APPLICATION PROGRAM COMMAND", false},
+      {0x2028, "LINE SEPARATOR", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(iscntrl(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+TEST(LlvmLibcWctypeUtilsTest, IsCntrlUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::iscntrl;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c < 0x20 || c == 0x7f);
+    EXPECT_EQ(iscntrl(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x0080, "PADDING CHARACTER", true},
+      {0x009F, "APPLICATION PROGRAM COMMAND", true},
+      {0x2028, "LINE SEPARATOR", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(iscntrl(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+} // namespace



More information about the libc-commits mailing list