[libc-commits] [libc] [libc][wctype] Create generation script for classification lookup tables (PR #172042)
Marcell Leleszi via libc-commits
libc-commits at lists.llvm.org
Tue Dec 30 09:42:43 PST 2025
================
@@ -0,0 +1,540 @@
+//===-- Unittests for wctype classification utils -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/macros/properties/os.h"
+#include "src/__support/wctype/wctype_classification_utils.h"
+#include "test/UnitTest/Test.h"
+
+namespace {
+
+// One classification expectation: a code point, its Unicode character name
+// (streamed into the failure message), and whether the property flag under
+// test is expected to be set for it.
+//
+// The code point is stored as uint32_t and cast to wchar_t at the call site.
+// On Windows, wchar_t is 16 bits. We guard the cases that do not fit
+// within 16 bits to prevent narrowing conversion and incorrect test results.
+struct TestCase {
+ uint32_t wc; // Code point under test.
+ const char *name; // Unicode character name, used to label failures.
+ bool expected; // Whether the tested property flag should be set.
+};
+
+// Verifies PropertyFlag::LOWER: lowercase Latin, Greek, and Cyrillic letters
+// must have the flag set, while uppercase letters, ASCII non-letters, and
+// letters from caseless scripts must not.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Lower) {
+  const TestCase lower_cases[] = {
+      // ASCII lowercase
+      {0x0061, "LATIN SMALL LETTER A", true},
+      {0x007A, "LATIN SMALL LETTER Z", true},
+
+      // ASCII uppercase
+      {0x0041, "LATIN CAPITAL LETTER A", false},
+      {0x005A, "LATIN CAPITAL LETTER Z", false},
+
+      // ASCII non-letters
+      {0x0030, "DIGIT ZERO", false},
+      {0x0020, "SPACE", false},
+      {0x0021, "EXCLAMATION MARK", false},
+
+      // Latin Extended lowercase
+      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", true},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
+
+      // Latin Extended uppercase
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", false},
+      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", false},
+
+      // Greek lowercase
+      {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+      {0x03C9, "GREEK SMALL LETTER OMEGA", true},
+
+      // Greek uppercase
+      {0x0391, "GREEK CAPITAL LETTER ALPHA", false},
+      {0x03A9, "GREEK CAPITAL LETTER OMEGA", false},
+
+      // Cyrillic lowercase
+      {0x0430, "CYRILLIC SMALL LETTER A", true},
+      {0x044F, "CYRILLIC SMALL LETTER YA", true},
+
+      // Cyrillic uppercase
+      {0x0410, "CYRILLIC CAPITAL LETTER A", false},
+      {0x042F, "CYRILLIC CAPITAL LETTER YA", false},
+
+      // Caseless scripts
+      {0x05D0, "HEBREW LETTER ALEF", false},
+      {0x0627, "ARABIC LETTER ALEF", false},
+      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
+  };
+
+  for (const TestCase &entry : lower_cases) {
+    // Look up the full property set first, then isolate the LOWER bit.
+    const auto properties =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(entry.wc));
+    bool is_lower = properties & LIBC_NAMESPACE::PropertyFlag::LOWER;
+    // The character name is streamed so a failure identifies the code point.
+    EXPECT_EQ(is_lower, entry.expected) << entry.name << "\n";
+  }
+}
+
+// Verifies PropertyFlag::UPPER: uppercase Latin, Greek, and Cyrillic letters
+// (and the titlecase letter U+01C5) must have the flag set, while lowercase
+// letters, ASCII non-letters, and letters from caseless scripts must not.
+TEST(LlvmLibcWctypeClassificationUtilsTest, Upper) {
+  const TestCase upper_cases[] = {
+      // ASCII lowercase
+      {0x0061, "LATIN SMALL LETTER A", false},
+      {0x007A, "LATIN SMALL LETTER Z", false},
+
+      // ASCII uppercase
+      {0x0041, "LATIN CAPITAL LETTER A", true},
+      {0x005A, "LATIN CAPITAL LETTER Z", true},
+
+      // ASCII non-letters
+      {0x0030, "DIGIT ZERO", false},
+      {0x0020, "SPACE", false},
+      {0x0021, "EXCLAMATION MARK", false},
+
+      // Titlecase
+      {0x01C5, "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON", true},
+
+      // Latin Extended lowercase
+      {0x00E0, "LATIN SMALL LETTER A WITH GRAVE", false},
+      {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", false},
+      {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", false},
+
+      // Latin Extended uppercase
+      {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+      {0x00C9, "LATIN CAPITAL LETTER E WITH ACUTE", true},
+
+      // Greek lowercase
+      {0x03B1, "GREEK SMALL LETTER ALPHA", false},
+      {0x03C9, "GREEK SMALL LETTER OMEGA", false},
+
+      // Greek uppercase
+      {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
+      {0x03A9, "GREEK CAPITAL LETTER OMEGA", true},
+
+      // Cyrillic lowercase
+      {0x0430, "CYRILLIC SMALL LETTER A", false},
+      {0x044F, "CYRILLIC SMALL LETTER YA", false},
+
+      // Cyrillic uppercase
+      {0x0410, "CYRILLIC CAPITAL LETTER A", true},
+      {0x042F, "CYRILLIC CAPITAL LETTER YA", true},
+
+      // Caseless scripts
+      {0x05D0, "HEBREW LETTER ALEF", false},
+      {0x0627, "ARABIC LETTER ALEF", false},
+      {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00", false},
+  };
+
+  for (const TestCase &entry : upper_cases) {
+    // Look up the full property set first, then isolate the UPPER bit.
+    const auto properties =
+        LIBC_NAMESPACE::lookup_properties(static_cast<wchar_t>(entry.wc));
+    bool is_upper = properties & LIBC_NAMESPACE::PropertyFlag::UPPER;
+    // The character name is streamed so a failure identifies the code point.
+    EXPECT_EQ(is_upper, entry.expected) << entry.name << "\n";
+  }
+}
+
+TEST(LlvmLibcWctypeClassificationUtilsTest, Alpha) {
+ TestCase cases[] = {
+ // ASCII letters
+ {0x0041, "LATIN CAPITAL LETTER A", true},
+ {0x0061, "LATIN SMALL LETTER A", true},
+ {0x005A, "LATIN CAPITAL LETTER Z", true},
+ {0x007A, "LATIN SMALL LETTER Z", true},
+
+ // ASCII non-letters
+ {0x0030, "DIGIT ZERO", false},
+ {0x0039, "DIGIT NINE", false},
+ {0x0020, "SPACE", false},
+ {0x0021, "EXCLAMATION MARK", false},
+ {0x007E, "TILDE", false},
+
+ // Modified letters
+ {0x02B0, "MODIFIED LETTER SMALL H", true},
+
+ // Latin Extended
+ {0x00C0, "LATIN CAPITAL LETTER A WITH GRAVE", true},
+ {0x00E9, "LATIN SMALL LETTER E WITH ACUTE", true},
+ {0x00FF, "LATIN SMALL LETTER Y WITH DIAERESIS", true},
+
+ // Greek
+ {0x0391, "GREEK CAPITAL LETTER ALPHA", true},
+ {0x03B1, "GREEK SMALL LETTER ALPHA", true},
+ {0x03C9, "GREEK SMALL LETTER OMEGA", true},
+
+ // Cyrillic
+ {0x0410, "CYRILLIC CAPITAL LETTER A", true},
+ {0x0430, "CYRILLIC SMALL LETTER A", true},
+ {0x044F, "CYRILLIC SMALL LETTER YA", true},
+
+ // Arabic
+ {0x0627, "ARABIC LETTER ALEF", true},
+ {0x0628, "ARABIC LETTER BEH", true},
+
+ // CJK
+ {0x4E00, "CJK UNIFIED IDEOGRAPH-4E00 (first)", true},
+ {0x4E01, "CJK UNIFIED IDEOGRAPH-4E01", true},
+ {0x9FFF, "CJK UNIFIED IDEOGRAPH-9FFF (last in BMP)", true},
+
+ // Emoji and symbols
+ {0x2764, "HEAVY BLACK HEART", false},
+#ifndef LIBC_TARGET_OS_IS_WINDOWS
+ {0x1F600, "GRINNING FACE", false},
+#endif // LIBC_TARGET_OS_IS_WINDOWS
----------------
mleleszi wrote:
Done
https://github.com/llvm/llvm-project/pull/172042
More information about the libc-commits mailing list