[libc-commits] [libc] [libc][wctype] Implement internal UTF8 wctype classification functions (PR #174607)
Marcell Leleszi via libc-commits
libc-commits at lists.llvm.org
Tue Jan 6 07:29:12 PST 2026
https://github.com/mleleszi created https://github.com/llvm/llvm-project/pull/174607
WIP
Closes [#174604](https://github.com/llvm/llvm-project/issues/174604)
This patch implements the internal wide character classification functions with Unicode support.
The ASCII paths which were not implemented are taken from [ctype_utils.h](https://github.com/llvm/llvm-project/blob/main/libc/src/__support/ctype_utils.h)
The tests mainly cover the dispatch logic between ASCII and UTF8; the UTF8 variants are already comprehensively tested in [wctype_classification_utils_test.cpp](https://github.com/mleleszi/llvm-project/blob/main/libc/test/src/__support/wctype/wctype_classification_utils_test.cpp).
>From afad4bdad5db4a5f15797490b8c5861fd48b96e4 Mon Sep 17 00:00:00 2001
From: Marcell Leleszi <mleleszi at google.com>
Date: Tue, 6 Jan 2026 14:58:45 +0000
Subject: [PATCH] Implement internal utf8 wctype classification functions
---
libc/src/__support/CMakeLists.txt | 12 +-
libc/src/__support/wctype_utils.h | 491 +++++++++++-----
libc/test/src/__support/CMakeLists.txt | 11 +
libc/test/src/__support/wctype_utils_test.cpp | 536 ++++++++++++++++++
4 files changed, 912 insertions(+), 138 deletions(-)
create mode 100644 libc/test/src/__support/wctype_utils_test.cpp
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index df524c25cbd8a..fb5aca3a9b1d9 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -156,12 +156,22 @@ add_header_library(
ctype_utils.h
)
+list(APPEND wctype_utils_deps
+ libc.hdr.types.wchar_t
+)
+
+if ("${LIBC_CONF_WCTYPE_MODE}" STREQUAL "LIBC_WCTYPE_MODE_UTF8")
+ list(APPEND wctype_utils_deps
+ libc.src.__support.wctype.wctype_classification_utils
+ )
+endif()
+
add_header_library(
wctype_utils
HDRS
wctype_utils.h
DEPENDS
- libc.hdr.types.wchar_t
+ ${wctype_utils_deps}
)
add_header_library(
diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h
index 7f17224104ffb..56e355d99cc65 100644
--- a/libc/src/__support/wctype_utils.h
+++ b/libc/src/__support/wctype_utils.h
@@ -13,6 +13,13 @@
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h"
+#define LIBC_WCTYPE_MODE_ASCII 0
+#define LIBC_WCTYPE_MODE_UTF8 1
+
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+#include "src/__support/wctype/wctype_classification_utils.h"
+#endif
+
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -38,8 +45,8 @@ namespace internal {
// This assumes the character ranges are contiguous, which they aren't in
// EBCDIC. Technically we could use some smaller ranges, but that's even harder
// to read.
-
-LIBC_INLINE static constexpr bool islower(wchar_t wch) {
+namespace ascii {
+LIBC_INLINE constexpr bool islower(wchar_t wch) {
switch (wch) {
case L'a':
case L'b':
@@ -73,7 +80,7 @@ LIBC_INLINE static constexpr bool islower(wchar_t wch) {
}
}
-LIBC_INLINE static constexpr bool isupper(wchar_t wch) {
+LIBC_INLINE constexpr bool isupper(wchar_t wch) {
switch (wch) {
case L'A':
case L'B':
@@ -107,7 +114,7 @@ LIBC_INLINE static constexpr bool isupper(wchar_t wch) {
}
}
-LIBC_INLINE static constexpr bool isdigit(wchar_t wch) {
+LIBC_INLINE constexpr bool isdigit(wchar_t wch) {
switch (wch) {
case L'0':
case L'1':
@@ -125,125 +132,169 @@ LIBC_INLINE static constexpr bool isdigit(wchar_t wch) {
}
}
-LIBC_INLINE static constexpr wchar_t tolower(wchar_t wch) {
+LIBC_INLINE constexpr bool isalpha(wchar_t wch) {
switch (wch) {
+ case L'a':
+ case L'b':
+ case L'c':
+ case L'd':
+ case L'e':
+ case L'f':
+ case L'g':
+ case L'h':
+ case L'i':
+ case L'j':
+ case L'k':
+ case L'l':
+ case L'm':
+ case L'n':
+ case L'o':
+ case L'p':
+ case L'q':
+ case L'r':
+ case L's':
+ case L't':
+ case L'u':
+ case L'v':
+ case L'w':
+ case L'x':
+ case L'y':
+ case L'z':
case L'A':
- return L'a';
case L'B':
- return L'b';
case L'C':
- return L'c';
case L'D':
- return L'd';
case L'E':
- return L'e';
case L'F':
- return L'f';
case L'G':
- return L'g';
case L'H':
- return L'h';
case L'I':
- return L'i';
case L'J':
- return L'j';
case L'K':
- return L'k';
case L'L':
- return L'l';
case L'M':
- return L'm';
case L'N':
- return L'n';
case L'O':
- return L'o';
case L'P':
- return L'p';
case L'Q':
- return L'q';
case L'R':
- return L'r';
case L'S':
- return L's';
case L'T':
- return L't';
case L'U':
- return L'u';
case L'V':
- return L'v';
case L'W':
- return L'w';
case L'X':
- return L'x';
case L'Y':
- return L'y';
case L'Z':
- return L'z';
+ return true;
default:
- return wch;
+ return false;
}
}
-LIBC_INLINE static constexpr wchar_t toupper(wchar_t wch) {
+LIBC_INLINE constexpr bool isalnum(wchar_t wch) {
switch (wch) {
case L'a':
- return L'A';
case L'b':
- return L'B';
case L'c':
- return L'C';
case L'd':
- return L'D';
case L'e':
- return L'E';
case L'f':
- return L'F';
case L'g':
- return L'G';
case L'h':
- return L'H';
case L'i':
- return L'I';
case L'j':
- return L'J';
case L'k':
- return L'K';
case L'l':
- return L'L';
case L'm':
- return L'M';
case L'n':
- return L'N';
case L'o':
- return L'O';
case L'p':
- return L'P';
case L'q':
- return L'Q';
case L'r':
- return L'R';
case L's':
- return L'S';
case L't':
- return L'T';
case L'u':
- return L'U';
case L'v':
- return L'V';
case L'w':
- return L'W';
case L'x':
- return L'X';
case L'y':
- return L'Y';
case L'z':
- return L'Z';
+ case L'A':
+ case L'B':
+ case L'C':
+ case L'D':
+ case L'E':
+ case L'F':
+ case L'G':
+ case L'H':
+ case L'I':
+ case L'J':
+ case L'K':
+ case L'L':
+ case L'M':
+ case L'N':
+ case L'O':
+ case L'P':
+ case L'Q':
+ case L'R':
+ case L'S':
+ case L'T':
+ case L'U':
+ case L'V':
+ case L'W':
+ case L'X':
+ case L'Y':
+ case L'Z':
+ case L'0':
+ case L'1':
+ case L'2':
+ case L'3':
+ case L'4':
+ case L'5':
+ case L'6':
+ case L'7':
+ case L'8':
+ case L'9':
+ return true;
default:
- return wch;
+ return false;
+ }
+}
+
+// Reports whether wch is one of the six ASCII whitespace characters: space,
+// horizontal tab, newline, vertical tab, form feed or carriage return. Each
+// character is compared individually so no contiguity of the code points is
+// assumed.
+LIBC_INLINE constexpr bool isspace(wchar_t wch) {
+  return wch == L' ' || wch == L'\t' || wch == L'\n' || wch == L'\v' ||
+         wch == L'\f' || wch == L'\r';
+}
+
+LIBC_INLINE constexpr bool isblank(wchar_t wch) {
+ switch (wch) {
+ case L' ':
+ case L'\t':
+ return true;
+ default:
+ return false;
}
}
-LIBC_INLINE static constexpr bool isalpha(wchar_t wch) {
+// Reports whether wch has a visible ASCII glyph, i.e. lies strictly between
+// space (0x20) and DEL (0x7f).
+LIBC_INLINE constexpr bool isgraph(wchar_t wch) {
+  return wch >= 0x21 && wch <= 0x7e;
+}
+
+// Reports whether wch is a printable ASCII character (space through '~').
+// The subtraction is done on the unsigned value, so anything below ' ' wraps
+// around to a large number and fails the single range comparison.
+LIBC_INLINE constexpr bool isprint(wchar_t wch) {
+  const unsigned offset_from_space = static_cast<unsigned>(wch) - ' ';
+  return offset_from_space < 95;
+}
+
+LIBC_INLINE constexpr bool isxdigit(wchar_t wch) {
switch (wch) {
case L'a':
case L'b':
@@ -251,129 +302,309 @@ LIBC_INLINE static constexpr bool isalpha(wchar_t wch) {
case L'd':
case L'e':
case L'f':
- case L'g':
- case L'h':
- case L'i':
- case L'j':
- case L'k':
- case L'l':
- case L'm':
- case L'n':
- case L'o':
- case L'p':
- case L'q':
- case L'r':
- case L's':
- case L't':
- case L'u':
- case L'v':
- case L'w':
- case L'x':
- case L'y':
- case L'z':
case L'A':
case L'B':
case L'C':
case L'D':
case L'E':
case L'F':
+ case L'0':
+ case L'1':
+ case L'2':
+ case L'3':
+ case L'4':
+ case L'5':
+ case L'6':
+ case L'7':
+ case L'8':
+ case L'9':
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Reports whether wch is an ASCII control character: 0x00-0x1f or DEL (0x7f).
+// The range test is performed on the unsigned value because wchar_t is a
+// signed type on some targets; a plain `wch < 0x20` would classify every
+// negative input (for example a converted WEOF) as a control character.
+LIBC_INLINE constexpr bool iscntrl(wchar_t wch) {
+  return static_cast<unsigned>(wch) < 0x20 || wch == 0x7f;
+}
+
+// Reports whether wch is ASCII punctuation: a graphic character that is
+// neither a letter nor a digit.
+LIBC_INLINE constexpr bool ispunct(wchar_t wch) {
+  if (!isgraph(wch))
+    return false;
+  return !isalnum(wch);
+}
+
+LIBC_INLINE constexpr wchar_t tolower(wchar_t wch) {
+ switch (wch) {
+ case L'A':
+ return L'a';
+ case L'B':
+ return L'b';
+ case L'C':
+ return L'c';
+ case L'D':
+ return L'd';
+ case L'E':
+ return L'e';
+ case L'F':
+ return L'f';
case L'G':
+ return L'g';
case L'H':
+ return L'h';
case L'I':
+ return L'i';
case L'J':
+ return L'j';
case L'K':
+ return L'k';
case L'L':
+ return L'l';
case L'M':
+ return L'm';
case L'N':
+ return L'n';
case L'O':
+ return L'o';
case L'P':
+ return L'p';
case L'Q':
+ return L'q';
case L'R':
+ return L'r';
case L'S':
+ return L's';
case L'T':
+ return L't';
case L'U':
+ return L'u';
case L'V':
+ return L'v';
case L'W':
+ return L'w';
case L'X':
+ return L'x';
case L'Y':
+ return L'y';
case L'Z':
- return true;
+ return L'z';
default:
- return false;
+ return wch;
}
}
-LIBC_INLINE static constexpr bool isalnum(wchar_t wch) {
+LIBC_INLINE constexpr wchar_t toupper(wchar_t wch) {
switch (wch) {
case L'a':
+ return L'A';
case L'b':
+ return L'B';
case L'c':
+ return L'C';
case L'd':
+ return L'D';
case L'e':
+ return L'E';
case L'f':
+ return L'F';
case L'g':
+ return L'G';
case L'h':
+ return L'H';
case L'i':
+ return L'I';
case L'j':
+ return L'J';
case L'k':
+ return L'K';
case L'l':
+ return L'L';
case L'm':
+ return L'M';
case L'n':
+ return L'N';
case L'o':
+ return L'O';
case L'p':
+ return L'P';
case L'q':
+ return L'Q';
case L'r':
+ return L'R';
case L's':
+ return L'S';
case L't':
+ return L'T';
case L'u':
+ return L'U';
case L'v':
+ return L'V';
case L'w':
+ return L'W';
case L'x':
+ return L'X';
case L'y':
+ return L'Y';
case L'z':
- case L'A':
- case L'B':
- case L'C':
- case L'D':
- case L'E':
- case L'F':
- case L'G':
- case L'H':
- case L'I':
- case L'J':
- case L'K':
- case L'L':
- case L'M':
- case L'N':
- case L'O':
- case L'P':
- case L'Q':
- case L'R':
- case L'S':
- case L'T':
- case L'U':
- case L'V':
- case L'W':
- case L'X':
- case L'Y':
- case L'Z':
- case L'0':
- case L'1':
- case L'2':
- case L'3':
- case L'4':
- case L'5':
- case L'6':
- case L'7':
- case L'8':
- case L'9':
- return true;
+ return L'Z';
default:
- return false;
+ return wch;
+ }
+}
+
+} // namespace ascii
+
+// Reports whether wch is a lowercase letter. In UTF8 mode, values outside the
+// ASCII range are answered by lookup_properties(); every other case uses the
+// ASCII implementation.
+LIBC_INLINE constexpr bool islower(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::LOWER;
+#endif
+  return ascii::islower(wch);
+}
+
+// Reports whether wch is an uppercase letter. In UTF8 mode, values outside
+// the ASCII range are answered by lookup_properties(); every other case uses
+// the ASCII implementation.
+LIBC_INLINE constexpr bool isupper(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::UPPER;
+#endif
+  return ascii::isupper(wch);
+}
+
+// Reports whether wch is a decimal digit. There is no mode-specific branch:
+// both configurations forward to the ASCII implementation.
+LIBC_INLINE constexpr bool isdigit(wchar_t wch) {
+ // In C.UTF-8, only ASCII digits are considered digits.
+ return ascii::isdigit(wch);
+}
+
+// Reports whether wch is alphabetic. In UTF8 mode, values outside the ASCII
+// range are answered by lookup_properties(); every other case uses the ASCII
+// implementation.
+LIBC_INLINE constexpr bool isalpha(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::ALPHA;
+#endif
+  return ascii::isalpha(wch);
+}
+
+// Reports whether wch is alphanumeric. In UTF8 mode, values outside the ASCII
+// range are answered by lookup_properties(); every other case uses the ASCII
+// implementation.
+LIBC_INLINE constexpr bool isalnum(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  // Digits are covered by the ASCII fallthrough below, so ALPHA is the only
+  // flag that needs checking outside the ASCII range.
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::ALPHA;
+#endif
+  return ascii::isalnum(wch);
+}
+
+// Reports whether wch is whitespace. In UTF8 mode, values outside the ASCII
+// range are answered by lookup_properties(); every other case uses the ASCII
+// implementation.
+LIBC_INLINE constexpr bool isspace(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::SPACE;
+#endif
+  return ascii::isspace(wch);
+}
+
+// Reports whether wch is a blank character. In UTF8 mode, values outside the
+// ASCII range are answered by lookup_properties(); every other case uses the
+// ASCII implementation.
+LIBC_INLINE constexpr bool isblank(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::BLANK;
+#endif
+  return ascii::isblank(wch);
+}
+
+// Reports whether wch is a graphic character. In UTF8 mode, values outside
+// the ASCII range are answered by lookup_properties(); every other case uses
+// the ASCII implementation.
+LIBC_INLINE constexpr bool isgraph(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128) {
+    // Graphic means PRINT is set while SPACE is clear.
+    return (lookup_properties(wch) &
+            (PropertyFlag::PRINT | PropertyFlag::SPACE)) == PropertyFlag::PRINT;
+  }
+#endif
+  return ascii::isgraph(wch);
+}
+
+LIBC_INLINE constexpr bool isprint(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE != LIBC_WCTYPE_MODE_UTF8
+ return ascii::isprint(wch);
+#else
+ if (static_cast<uint32_t>(wch) < 128) {
+ return ascii::isprint(wch);
}
+ return lookup_properties(wch) & PropertyFlag::PRINT;
+#endif
}
-LIBC_INLINE static constexpr int b36_char_to_int(wchar_t wch) {
+// Reports whether wch is a hexadecimal digit. There is no mode-specific
+// branch: the ASCII implementation is used unconditionally.
+LIBC_INLINE constexpr bool isxdigit(wchar_t wch) {
+ return ascii::isxdigit(wch);
+}
+
+// Reports whether wch is a control character. In UTF8 mode, values outside
+// the ASCII range are answered by lookup_properties(); every other case uses
+// the ASCII implementation.
+LIBC_INLINE constexpr bool iscntrl(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::CNTRL;
+#endif
+  return ascii::iscntrl(wch);
+}
+
+// Reports whether wch is punctuation. In UTF8 mode, values outside the ASCII
+// range are answered by lookup_properties(); every other case uses the ASCII
+// implementation.
+LIBC_INLINE constexpr bool ispunct(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  if (static_cast<uint32_t>(wch) >= 128)
+    return lookup_properties(wch) & PropertyFlag::PUNCT;
+#endif
+  return ascii::ispunct(wch);
+}
+
+// Maps wch to its lowercase counterpart. Only ASCII letters are converted; in
+// UTF8 mode values outside the ASCII range are currently returned unchanged.
+LIBC_INLINE constexpr wchar_t tolower(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  // TODO: Add a UTF8 implementation. When doing so it is probably worth moving
+  // the conversion functions to a separate header, so that using only
+  // conversion does not pull in the classification lookup tables, and vice
+  // versa.
+  if (static_cast<uint32_t>(wch) >= 128)
+    return wch;
+#endif
+  return ascii::tolower(wch);
+}
+
+// Maps wch to its uppercase counterpart. Only ASCII letters are converted; in
+// UTF8 mode values outside the ASCII range are currently returned unchanged.
+LIBC_INLINE constexpr wchar_t toupper(wchar_t wch) {
+#if LIBC_CONF_WCTYPE_MODE == LIBC_WCTYPE_MODE_UTF8
+  // TODO: Add a UTF8 implementation. When doing so it is probably worth moving
+  // the conversion functions to a separate header, so that using only
+  // conversion does not pull in the classification lookup tables, and vice
+  // versa.
+  if (static_cast<uint32_t>(wch) >= 128)
+    return wch;
+#endif
+  return ascii::toupper(wch);
+}
+
+LIBC_INLINE constexpr int b36_char_to_int(wchar_t wch) {
switch (wch) {
case L'0':
return 0;
@@ -561,20 +792,6 @@ LIBC_INLINE static constexpr wchar_t int_to_b36_wchar(int num) {
}
}
-LIBC_INLINE static constexpr bool isspace(wchar_t wch) {
- switch (wch) {
- case L' ':
- case L'\t':
- case L'\n':
- case L'\v':
- case L'\f':
- case L'\r':
- return true;
- default:
- return false;
- }
-}
-
// An overload which provides a way to compare input with specific character
// values, when input can be of a regular or a wide character type.
LIBC_INLINE static constexpr bool
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 98980ce66d9b2..b7ce7422c28a8 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -280,6 +280,17 @@ add_libc_test(
libc.src.__support.CPP.bit
)
+# The test source includes wctype_classification_utils.h directly so that it
+# can build the UTF-8 variant of wctype_utils.h regardless of the configured
+# mode. wctype_utils only depends on that library when LIBC_CONF_WCTYPE_MODE is
+# LIBC_WCTYPE_MODE_UTF8, so it has to be listed here explicitly.
+add_libc_test(
+  wctype_utils_test
+  SUITE
+    libc-support-tests
+  SRCS
+    wctype_utils_test.cpp
+  DEPENDS
+    libc.src.__support.wctype_utils
+    libc.src.__support.wctype.wctype_classification_utils
+    libc.src.__support.macros.properties.os
+)
+
add_subdirectory(CPP)
add_subdirectory(File)
add_subdirectory(RPC)
diff --git a/libc/test/src/__support/wctype_utils_test.cpp b/libc/test/src/__support/wctype_utils_test.cpp
new file mode 100644
index 0000000000000..9bc7e818cdbc6
--- /dev/null
+++ b/libc/test/src/__support/wctype_utils_test.cpp
@@ -0,0 +1,536 @@
+#include "src/__support/macros/config.h"
+#include "src/__support/wctype/wctype_classification_utils.h"
+#include "test/UnitTest/Test.h"
+
+namespace {
+
+// Copy of wctype_utils.h built with the mode macro forced to ASCII. Including
+// the header inside this namespace keeps its definitions apart from any other
+// copy included by this file.
+namespace ascii_mode {
+#undef LIBC_CONF_WCTYPE_MODE
+#define LIBC_CONF_WCTYPE_MODE LIBC_WCTYPE_MODE_ASCII
+
+// NOTE(review): presumably this is the header's include guard, cleared so the
+// #include below is not skipped -- confirm against wctype_utils.h.
+#undef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
+#include "src/__support/wctype_utils.h"
+} // namespace ascii_mode
+
+// Copy of wctype_utils.h built with the mode macro forced to UTF8. Including
+// the header inside this namespace keeps its definitions apart from any other
+// copy included by this file.
+namespace utf8_mode {
+#undef LIBC_CONF_WCTYPE_MODE
+#define LIBC_CONF_WCTYPE_MODE LIBC_WCTYPE_MODE_UTF8
+
+// Re-export the classification helpers from the top-level LIBC_NAMESPACE into
+// this nested copy of the namespace. NOTE(review): presumably required because
+// wctype_classification_utils.h was already included at file scope, so its
+// own include guard prevents it from being declared again here -- confirm.
+namespace LIBC_NAMESPACE_DECL {
+using ::LIBC_NAMESPACE::lookup_properties;
+using ::LIBC_NAMESPACE::PropertyFlag;
+} // namespace LIBC_NAMESPACE_DECL
+
+// NOTE(review): presumably this is the header's include guard, cleared so the
+// #include below is not skipped -- confirm against wctype_utils.h.
+#undef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
+#include "src/__support/wctype_utils.h"
+} // namespace utf8_mode
+
+// One table-driven expectation for a single non-ASCII code point.
+struct TestCase {
+ uint32_t wc; // Code point under test; cast to wchar_t at the call site.
+ const char *name; // Character name streamed into the failure message.
+ bool expected; // Result the classification function must return.
+};
+
+// Reference predicate for ASCII punctuation, used to compute the expected
+// result of ispunct(). The four ranges it accepts are:
+//   Decimal   | Symbol
+//   -----------------------------------------
+//   33 - 47   | ! " # $ % & ' ( ) * + , - . /
+//   58 - 64   | : ; < = > ? @
+//   91 - 96   | [ \ ] ^ _ `
+//   123 - 126 | { | } ~
+bool is_punctuation_character(int c) {
+  if (c >= '!' && c <= '/')
+    return true;
+  if (c >= ':' && c <= '@')
+    return true;
+  if (c >= '[' && c <= '`')
+    return true;
+  return c >= '{' && c <= '~';
+}
+
+// ASCII mode: islower() accepts exactly 'a'..'z' and rejects all non-ASCII
+// input.
+TEST(LlvmLibcWctypeUtilsTest, IsLowerAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 'a' <= code && code <= 'z';
+    EXPECT_EQ(impl::islower(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::islower(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and non-ASCII
+// lowercase letters are recognised as well.
+TEST(LlvmLibcWctypeUtilsTest, IsLowerUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 'a' <= code && code <= 'z';
+    EXPECT_EQ(impl::islower(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::islower(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isupper() accepts exactly 'A'..'Z' and rejects all non-ASCII
+// input.
+TEST(LlvmLibcWctypeUtilsTest, IsUpperAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 'A' <= code && code <= 'Z';
+    EXPECT_EQ(impl::isupper(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isupper(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and non-ASCII
+// uppercase letters are recognised as well.
+TEST(LlvmLibcWctypeUtilsTest, IsUpperUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 'A' <= code && code <= 'Z';
+    EXPECT_EQ(impl::isupper(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isupper(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isalpha() accepts exactly the 52 ASCII letters and rejects all
+// non-ASCII input.
+TEST(LlvmLibcWctypeUtilsTest, IsAlphaAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool is_lower = 'a' <= code && code <= 'z';
+    const bool is_upper = 'A' <= code && code <= 'Z';
+    EXPECT_EQ(impl::isalpha(static_cast<wchar_t>(code)), is_lower || is_upper);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isalpha(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and non-ASCII letters
+// are recognised as well.
+TEST(LlvmLibcWctypeUtilsTest, IsAlphaUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool is_lower = 'a' <= code && code <= 'z';
+    const bool is_upper = 'A' <= code && code <= 'Z';
+    EXPECT_EQ(impl::isalpha(static_cast<wchar_t>(code)), is_lower || is_upper);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isalpha(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isdigit() accepts exactly '0'..'9' and rejects all non-ASCII
+// input.
+TEST(LlvmLibcWctypeUtilsTest, IsDigitAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = '0' <= code && code <= '9';
+    EXPECT_EQ(impl::isdigit(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isdigit(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: isdigit() still accepts only '0'..'9'.
+TEST(LlvmLibcWctypeUtilsTest, IsDigitUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = '0' <= code && code <= '9';
+    EXPECT_EQ(impl::isdigit(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range. In C.UTF-8, isdigit only returns true
+  // for ASCII digits, so digits of other scripts must be rejected.
+  const TestCase non_ascii[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isdigit(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isalnum() accepts exactly the ASCII letters and digits and
+// rejects all non-ASCII input.
+TEST(LlvmLibcWctypeUtilsTest, IsAlnumAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool is_digit = '0' <= code && code <= '9';
+    const bool is_lower = 'a' <= code && code <= 'z';
+    const bool is_upper = 'A' <= code && code <= 'Z';
+    EXPECT_EQ(impl::isalnum(static_cast<wchar_t>(code)),
+              is_digit || is_lower || is_upper);
+  }
+
+  // Spot checks, including one ASCII digit as a positive control.
+  const TestCase samples[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+      {0x0030, "DIGIT ZERO", true},
+  };
+
+  for (const TestCase &entry : samples) {
+    EXPECT_EQ(impl::isalnum(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and the non-ASCII
+// samples below are accepted as well.
+TEST(LlvmLibcWctypeUtilsTest, IsAlnumUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool is_digit = '0' <= code && code <= '9';
+    const bool is_lower = 'a' <= code && code <= 'z';
+    const bool is_upper = 'A' <= code && code <= 'Z';
+    EXPECT_EQ(impl::isalnum(static_cast<wchar_t>(code)),
+              is_digit || is_lower || is_upper);
+  }
+
+  // Spot checks, including one ASCII digit as a positive control.
+  const TestCase samples[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", true},
+      {0x0030, "DIGIT ZERO", true},
+  };
+
+  for (const TestCase &entry : samples) {
+    EXPECT_EQ(impl::isalnum(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isxdigit() accepts exactly 0-9, a-f and A-F. Non-ASCII digits
+// are checked too, so this test covers the same ground as its siblings.
+TEST(LlvmLibcWctypeUtilsTest, IsXDigitAscii) {
+  using ascii_mode::LIBC_NAMESPACE::internal::isxdigit;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+                    (c >= 'A' && c <= 'F');
+    EXPECT_EQ(isxdigit(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+      {0xFF11, "FULLWIDTH DIGIT ONE", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isxdigit(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+// UTF8 mode: isxdigit() forwards to the ASCII implementation, so it accepts
+// exactly 0-9, a-f and A-F and must reject digits from other scripts.
+TEST(LlvmLibcWctypeUtilsTest, IsXDigitUtf8) {
+  using utf8_mode::LIBC_NAMESPACE::internal::isxdigit;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
+                    (c >= 'A' && c <= 'F');
+    EXPECT_EQ(isxdigit(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x0660, "ARABIC-INDIC DIGIT ZERO", false},
+      {0xFF11, "FULLWIDTH DIGIT ONE", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(isxdigit(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+// ASCII mode: isspace() accepts exactly the six ASCII whitespace characters
+// and rejects all non-ASCII input.
+TEST(LlvmLibcWctypeUtilsTest, IsSpaceAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = code == ' ' || code == '\t' || code == '\n' ||
+                      code == '\v' || code == '\f' || code == '\r';
+    EXPECT_EQ(impl::isspace(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+      {0x2028, "LINE SEPARATOR", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isspace(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and non-ASCII space
+// characters are recognised (NO-BREAK SPACE is expected to be rejected).
+TEST(LlvmLibcWctypeUtilsTest, IsSpaceUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = code == ' ' || code == '\t' || code == '\n' ||
+                      code == '\v' || code == '\f' || code == '\r';
+    EXPECT_EQ(impl::isspace(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x1680, "OGHAM SPACE MARK", true},
+      {0x2000, "EN QUAD", true},
+      {0x2028, "LINE SEPARATOR", true},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isspace(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isblank() accepts only space and horizontal tab and rejects all
+// non-ASCII input.
+TEST(LlvmLibcWctypeUtilsTest, IsBlankAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = code == ' ' || code == '\t';
+    EXPECT_EQ(impl::isblank(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isblank(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and non-ASCII blank
+// characters are recognised (NO-BREAK SPACE is expected to be rejected).
+TEST(LlvmLibcWctypeUtilsTest, IsBlankUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = code == ' ' || code == '\t';
+    EXPECT_EQ(impl::isblank(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x1680, "OGHAM SPACE MARK", true},
+      {0x2000, "EN QUAD", true},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isblank(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isgraph() accepts 0x21..0x7e and rejects all non-ASCII input.
+TEST(LlvmLibcWctypeUtilsTest, IsGraphAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 0x20 < code && code < 0x7f;
+    EXPECT_EQ(impl::isgraph(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+      {0x2603, "SNOWMAN", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isgraph(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode; above it, printable
+// characters that are not whitespace count as graphic.
+TEST(LlvmLibcWctypeUtilsTest, IsGraphUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 0x20 < code && code < 0x7f;
+    EXPECT_EQ(impl::isgraph(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", true},
+      {0x2000, "EN QUAD", false},
+      {0x2603, "SNOWMAN", true},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isgraph(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: isprint() accepts 0x20..0x7e and rejects all non-ASCII input.
+TEST(LlvmLibcWctypeUtilsTest, IsPrintAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 0x20 <= code && code < 0x7f;
+    EXPECT_EQ(impl::isprint(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", false},
+      {0x2000, "EN QUAD", false},
+      {0x2603, "SNOWMAN", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isprint(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and printable
+// non-ASCII characters (spaces included) are accepted.
+TEST(LlvmLibcWctypeUtilsTest, IsPrintUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = 0x20 <= code && code < 0x7f;
+    EXPECT_EQ(impl::isprint(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x00A0, "NO-BREAK SPACE", true},
+      {0x2000, "EN QUAD", true},
+      {0x2603, "SNOWMAN", true},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::isprint(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// ASCII mode: ispunct() must agree with is_punctuation_character() over the
+// ASCII range and reject all non-ASCII input.
+TEST(LlvmLibcWctypeUtilsTest, IsPunctAscii) {
+  // Only ispunct is exercised here; the unused isalnum/isgraph
+  // using-declarations were dropped.
+  using ascii_mode::LIBC_NAMESPACE::internal::ispunct;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = is_punctuation_character(c);
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A1, "INVERTED EXCLAMATION MARK", false},
+      {0x2014, "EM DASH", false},
+      {0x20AC, "EURO SIGN", false},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+// UTF8 mode: ispunct() must agree with is_punctuation_character() over the
+// ASCII range and additionally accept non-ASCII punctuation and symbols.
+TEST(LlvmLibcWctypeUtilsTest, IsPunctUtf8) {
+  // Only ispunct is exercised here; the unused isalnum/isgraph
+  // using-declarations were dropped.
+  using utf8_mode::LIBC_NAMESPACE::internal::ispunct;
+
+  // ASCII
+  for (int c = 0; c < 128; ++c) {
+    bool expected = is_punctuation_character(c);
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(c)), expected);
+  }
+
+  // Non ASCII
+  TestCase cases[] = {
+      {0x00A1, "INVERTED EXCLAMATION MARK", true},
+      {0x2014, "EM DASH", true},
+      {0x20AC, "EURO SIGN", true},
+  };
+
+  for (const auto &tc : cases) {
+    EXPECT_EQ(ispunct(static_cast<wchar_t>(tc.wc)), tc.expected) << tc.name;
+  }
+}
+
+// ASCII mode: iscntrl() accepts 0x00..0x1f and 0x7f and rejects all non-ASCII
+// input.
+TEST(LlvmLibcWctypeUtilsTest, IsCntrlAscii) {
+  namespace impl = ascii_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = code < 0x20 || code == 0x7f;
+    EXPECT_EQ(impl::iscntrl(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x0080, "PADDING CHARACTER", false},
+      {0x009F, "APPLICATION PROGRAM COMMAND", false},
+      {0x2028, "LINE SEPARATOR", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::iscntrl(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+// UTF8 mode: the ASCII range behaves as in ASCII mode, and the control
+// characters at 0x80 and 0x9f are recognised as well.
+TEST(LlvmLibcWctypeUtilsTest, IsCntrlUtf8) {
+  namespace impl = utf8_mode::LIBC_NAMESPACE::internal;
+
+  // Exhaustive sweep over the 7-bit range.
+  for (int code = 0; code <= 0x7f; ++code) {
+    const bool want = code < 0x20 || code == 0x7f;
+    EXPECT_EQ(impl::iscntrl(static_cast<wchar_t>(code)), want);
+  }
+
+  // Spot checks above the 7-bit range.
+  const TestCase non_ascii[] = {
+      {0x0080, "PADDING CHARACTER", true},
+      {0x009F, "APPLICATION PROGRAM COMMAND", true},
+      {0x2028, "LINE SEPARATOR", false},
+  };
+
+  for (const TestCase &entry : non_ascii) {
+    EXPECT_EQ(impl::iscntrl(static_cast<wchar_t>(entry.wc)), entry.expected)
+        << entry.name;
+  }
+}
+
+} // namespace
More information about the libc-commits
mailing list