[libcxx-commits] [libcxx] [llvm] Finish supporting Unicode 18 (PR #199111)
Charlie Lin via libcxx-commits
libcxx-commits at lists.llvm.org
Fri May 22 06:24:43 PDT 2026
https://github.com/clin1234 updated https://github.com/llvm/llvm-project/pull/199111
>From e80090631e85bc24b8742d0922bf88eff998c09e Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 13:52:02 -0500
Subject: [PATCH 01/14] Update Unicode version in comments to 18.0.0 in
generate_escaped_output_table.py
---
libcxx/utils/generate_escaped_output_table.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index 30583dd8cdc85..dd7173201b466 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -322,7 +322,7 @@ def generate_data_tables() -> str:
# The output table has two large entries at the end, with a small "gap"
# E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
- # Based on Unicode 15.1.0:
+ # Based on Unicode 18.0.0:
# - Encoding all these entries in the table requires 1173 entries.
# - Manually handling these last two blocks reduces the size to 729 entries.
# This not only reduces the binary size, but also improves the performance
>From e3ddeb6cc37f69abeb9f6f7e7303030a17e3af9c Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 13:55:08 -0500
Subject: [PATCH 02/14] Update Unicode data URL in
UnicodeNameMappingGenerator.cpp
---
llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
index 5796c94afef4a..106a51b80afca 100644
--- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
+++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
@@ -8,7 +8,7 @@
//
// This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
// using UnicodeData.txt and NameAliases.txt available at
-// https://unicode.org/Public/15.1.0/ucd/
+// https://unicode.org/Public/18.0.0/ucd/
//===----------------------------------------------------------------------===//
#include "llvm/ADT/STLExtras.h"
>From 06d685b9d2814f5fa286b993bea38d90f1f4e458 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 14:38:07 -0500
Subject: [PATCH 03/14] Update __entries array size and values for Unicode 18
---
.../include/__format/escaped_output_table.h | 141 ++++++++++--------
1 file changed, 75 insertions(+), 66 deletions(-)
diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h
index a62c715811a0c..c0d401d9a00c1 100644
--- a/libcxx/include/__format/escaped_output_table.h
+++ b/libcxx/include/__format/escaped_output_table.h
@@ -110,7 +110,7 @@ namespace __escaped_output_table {
/// - bits [14, 31] The lower bound code point of the range. The upper bound of
/// the range is lower bound + size. Note the code expects code units the fit
/// into 18 bits, instead of the 21 bits needed for the full Unicode range.
-_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
+_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[744] = {
0x00000020 /* 00000000 - 00000020 [ 33] */,
0x001fc021 /* 0000007f - 000000a0 [ 34] */,
0x002b4000 /* 000000ad - 000000ad [ 1] */,
@@ -120,10 +120,9 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x00e34000 /* 0000038d - 0000038d [ 1] */,
0x00e88000 /* 000003a2 - 000003a2 [ 1] */,
0x014c0000 /* 00000530 - 00000530 [ 1] */,
- 0x0155c001 /* 00000557 - 00000558 [ 2] */,
- 0x0162c001 /* 0000058b - 0000058c [ 2] */,
+ 0x0155c000 /* 00000557 - 00000557 [ 1] */,
0x01640000 /* 00000590 - 00000590 [ 1] */,
- 0x01720007 /* 000005c8 - 000005cf [ 8] */,
+ 0x01728005 /* 000005ca - 000005cf [ 6] */,
0x017ac003 /* 000005eb - 000005ee [ 4] */,
0x017d4010 /* 000005f5 - 00000605 [ 17] */,
0x01870000 /* 0000061c - 0000061c [ 1] */,
@@ -137,7 +136,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x02170001 /* 0000085c - 0000085d [ 2] */,
0x0217c000 /* 0000085f - 0000085f [ 1] */,
0x021ac004 /* 0000086b - 0000086f [ 5] */,
- 0x0223c007 /* 0000088f - 00000896 [ 8] */,
+ 0x02240006 /* 00000890 - 00000896 [ 7] */,
0x02388000 /* 000008e2 - 000008e2 [ 1] */,
0x02610000 /* 00000984 - 00000984 [ 1] */,
0x02634001 /* 0000098d - 0000098e [ 2] */,
@@ -192,7 +191,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x02ce8001 /* 00000b3a - 00000b3b [ 2] */,
0x02d14001 /* 00000b45 - 00000b46 [ 2] */,
0x02d24001 /* 00000b49 - 00000b4a [ 2] */,
- 0x02d38006 /* 00000b4e - 00000b54 [ 7] */,
+ 0x02d38004 /* 00000b4e - 00000b52 [ 5] */,
0x02d60003 /* 00000b58 - 00000b5b [ 4] */,
0x02d78000 /* 00000b5e - 00000b5e [ 1] */,
0x02d90001 /* 00000b64 - 00000b65 [ 2] */,
@@ -221,7 +220,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x03124000 /* 00000c49 - 00000c49 [ 1] */,
0x03138006 /* 00000c4e - 00000c54 [ 7] */,
0x0315c000 /* 00000c57 - 00000c57 [ 1] */,
- 0x0316c001 /* 00000c5b - 00000c5c [ 2] */,
+ 0x0316c000 /* 00000c5b - 00000c5b [ 1] */,
0x03178001 /* 00000c5e - 00000c5f [ 2] */,
0x03190001 /* 00000c64 - 00000c65 [ 2] */,
0x031c0006 /* 00000c70 - 00000c76 [ 7] */,
@@ -233,7 +232,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x03314000 /* 00000cc5 - 00000cc5 [ 1] */,
0x03324000 /* 00000cc9 - 00000cc9 [ 1] */,
0x03338006 /* 00000cce - 00000cd4 [ 7] */,
- 0x0335c005 /* 00000cd7 - 00000cdc [ 6] */,
+ 0x0335c004 /* 00000cd7 - 00000cdb [ 5] */,
0x0337c000 /* 00000cdf - 00000cdf [ 1] */,
0x03390001 /* 00000ce4 - 00000ce5 [ 2] */,
0x033c0000 /* 00000cf0 - 00000cf0 [ 1] */,
@@ -331,7 +330,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x06a28005 /* 00001a8a - 00001a8f [ 6] */,
0x06a68005 /* 00001a9a - 00001a9f [ 6] */,
0x06ab8001 /* 00001aae - 00001aaf [ 2] */,
- 0x06b3c030 /* 00001acf - 00001aff [ 49] */,
+ 0x06bc400e /* 00001af1 - 00001aff [ 15] */,
0x06d34000 /* 00001b4d - 00001b4d [ 1] */,
0x06fd0007 /* 00001bf4 - 00001bfb [ 8] */,
0x070e0002 /* 00001c38 - 00001c3a [ 3] */,
@@ -359,15 +358,12 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x080a0007 /* 00002028 - 0000202f [ 8] */,
0x0817c010 /* 0000205f - 0000206f [ 17] */,
0x081c8001 /* 00002072 - 00002073 [ 2] */,
- 0x0823c000 /* 0000208f - 0000208f [ 1] */,
- 0x08274002 /* 0000209d - 0000209f [ 3] */,
- 0x0830400e /* 000020c1 - 000020cf [ 15] */,
+ 0x0831400a /* 000020c5 - 000020cf [ 11] */,
0x083c400e /* 000020f1 - 000020ff [ 15] */,
0x08630003 /* 0000218c - 0000218f [ 4] */,
0x090a8015 /* 0000242a - 0000243f [ 22] */,
0x0912c014 /* 0000244b - 0000245f [ 21] */,
0x0add0001 /* 00002b74 - 00002b75 [ 2] */,
- 0x0ae58000 /* 00002b96 - 00002b96 [ 1] */,
0x0b3d0004 /* 00002cf4 - 00002cf8 [ 5] */,
0x0b498000 /* 00002d26 - 00002d26 [ 1] */,
0x0b4a0004 /* 00002d28 - 00002d2c [ 5] */,
@@ -383,7 +379,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x0b73c000 /* 00002dcf - 00002dcf [ 1] */,
0x0b75c000 /* 00002dd7 - 00002dd7 [ 1] */,
0x0b77c000 /* 00002ddf - 00002ddf [ 1] */,
- 0x0b978021 /* 00002e5e - 00002e7f [ 34] */,
+ 0x0b978001 /* 00002e5e - 00002e5f [ 2] */,
+ 0x0b99001b /* 00002e64 - 00002e7f [ 28] */,
0x0ba68000 /* 00002e9a - 00002e9a [ 1] */,
0x0bbd000b /* 00002ef4 - 00002eff [ 12] */,
0x0bf58019 /* 00002fd6 - 00002fef [ 26] */,
@@ -399,10 +396,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x2931c008 /* 0000a4c7 - 0000a4cf [ 9] */,
0x298b0013 /* 0000a62c - 0000a63f [ 20] */,
0x29be0007 /* 0000a6f8 - 0000a6ff [ 8] */,
- 0x29f38001 /* 0000a7ce - 0000a7cf [ 2] */,
- 0x29f48000 /* 0000a7d2 - 0000a7d2 [ 1] */,
- 0x29f50000 /* 0000a7d4 - 0000a7d4 [ 1] */,
- 0x29f74014 /* 0000a7dd - 0000a7f1 [ 21] */,
+ 0x29f78003 /* 0000a7de - 0000a7e1 [ 4] */,
+ 0x29f8c00d /* 0000a7e3 - 0000a7f0 [ 14] */,
0x2a0b4002 /* 0000a82d - 0000a82f [ 3] */,
0x2a0e8005 /* 0000a83a - 0000a83f [ 6] */,
0x2a1e0007 /* 0000a878 - 0000a87f [ 8] */,
@@ -423,7 +418,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x2ac5c008 /* 0000ab17 - 0000ab1f [ 9] */,
0x2ac9c000 /* 0000ab27 - 0000ab27 [ 1] */,
0x2acbc000 /* 0000ab2f - 0000ab2f [ 1] */,
- 0x2adb0003 /* 0000ab6c - 0000ab6f [ 4] */,
+ 0x2adb8001 /* 0000ab6e - 0000ab6f [ 2] */,
0x2afb8001 /* 0000abee - 0000abef [ 2] */,
0x2afe8005 /* 0000abfa - 0000abff [ 6] */,
0x35e9000b /* 0000d7a4 - 0000d7af [ 12] */,
@@ -438,9 +433,6 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x3ecfc000 /* 0000fb3f - 0000fb3f [ 1] */,
0x3ed08000 /* 0000fb42 - 0000fb42 [ 1] */,
0x3ed14000 /* 0000fb45 - 0000fb45 [ 1] */,
- 0x3ef0c00f /* 0000fbc3 - 0000fbd2 [ 16] */,
- 0x3f640001 /* 0000fd90 - 0000fd91 [ 2] */,
- 0x3f720006 /* 0000fdc8 - 0000fdce [ 7] */,
0x3f74001f /* 0000fdd0 - 0000fdef [ 32] */,
0x3f868005 /* 0000fe1a - 0000fe1f [ 6] */,
0x3f94c000 /* 0000fe53 - 0000fe53 [ 1] */,
@@ -498,7 +490,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x41da0017 /* 00010768 - 0001077f [ 24] */,
0x41e18000 /* 00010786 - 00010786 [ 1] */,
0x41ec4000 /* 000107b1 - 000107b1 [ 1] */,
- 0x41eec044 /* 000107bb - 000107ff [ 69] */,
+ 0x41f0003f /* 000107c0 - 000107ff [ 64] */,
0x42018001 /* 00010806 - 00010807 [ 2] */,
0x42024000 /* 00010809 - 00010809 [ 1] */,
0x420d8000 /* 00010836 - 00010836 [ 1] */,
@@ -511,7 +503,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x423d8004 /* 000108f6 - 000108fa [ 5] */,
0x42470002 /* 0001091c - 0001091e [ 3] */,
0x424e8004 /* 0001093a - 0001093e [ 5] */,
- 0x4250003f /* 00010940 - 0001097f [ 64] */,
+ 0x42568025 /* 0001095a - 0001097f [ 38] */,
0x426e0003 /* 000109b8 - 000109bb [ 4] */,
0x42740001 /* 000109d0 - 000109d1 [ 2] */,
0x42810000 /* 00010a04 - 00010a04 [ 1] */,
@@ -543,7 +535,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x43aa8000 /* 00010eaa - 00010eaa [ 1] */,
0x43ab8001 /* 00010eae - 00010eaf [ 2] */,
0x43ac800f /* 00010eb2 - 00010ec1 [ 16] */,
- 0x43b14036 /* 00010ec5 - 00010efb [ 55] */,
+ 0x43b20000 /* 00010ec8 - 00010ec8 [ 1] */,
+ 0x43bbc000 /* 00010eef - 00010eef [ 1] */,
0x43ca0007 /* 00010f28 - 00010f2f [ 8] */,
0x43d68015 /* 00010f5a - 00010f6f [ 22] */,
0x43e28025 /* 00010f8a - 00010faf [ 38] */,
@@ -626,7 +619,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x46920007 /* 00011a48 - 00011a4f [ 8] */,
0x46a8c00c /* 00011aa3 - 00011aaf [ 13] */,
0x46be4006 /* 00011af9 - 00011aff [ 7] */,
- 0x46c280b5 /* 00011b0a - 00011bbf [ 182] */,
+ 0x46c2c054 /* 00011b0b - 00011b5f [ 85] */,
+ 0x46da0057 /* 00011b68 - 00011bbf [ 88] */,
0x46f8800d /* 00011be2 - 00011bef [ 14] */,
0x46fe8005 /* 00011bfa - 00011bff [ 6] */,
0x47024000 /* 00011c09 - 00011c09 [ 1] */,
@@ -648,7 +642,10 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x4763c000 /* 00011d8f - 00011d8f [ 1] */,
0x47648000 /* 00011d92 - 00011d92 [ 1] */,
0x47664006 /* 00011d99 - 00011d9f [ 7] */,
- 0x476a8135 /* 00011daa - 00011edf [ 310] */,
+ 0x476a8005 /* 00011daa - 00011daf [ 6] */,
+ 0x47770003 /* 00011ddc - 00011ddf [ 4] */,
+ 0x477a8005 /* 00011dea - 00011def [ 6] */,
+ 0x477c80ed /* 00011df2 - 00011edf [ 238] */,
0x47be4006 /* 00011ef9 - 00011eff [ 7] */,
0x47c44000 /* 00011f11 - 00011f11 [ 1] */,
0x47cec002 /* 00011f3b - 00011f3d [ 3] */,
@@ -656,9 +653,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x47ec400e /* 00011fb1 - 00011fbf [ 15] */,
0x47fc800c /* 00011ff2 - 00011ffe [ 13] */,
0x48e68065 /* 0001239a - 000123ff [ 102] */,
- 0x491bc000 /* 0001246f - 0001246f [ 1] */,
- 0x491d400a /* 00012475 - 0001247f [ 11] */,
- 0x49510a4b /* 00012544 - 00012f8f [ 2636] */,
+ 0x4951000b /* 00012544 - 0001254f [ 12] */,
+ 0x49a1c908 /* 00012687 - 00012f8f [ 2313] */,
0x4bfcc00c /* 00012ff3 - 00012fff [ 13] */,
0x4d0c000f /* 00013430 - 0001343f [ 16] */,
0x4d158009 /* 00013456 - 0001345f [ 10] */,
@@ -677,40 +673,48 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x5ad88000 /* 00016b62 - 00016b62 [ 1] */,
0x5ade0004 /* 00016b78 - 00016b7c [ 5] */,
0x5ae401af /* 00016b90 - 00016d3f [ 432] */,
- 0x5b5e80c5 /* 00016d7a - 00016e3f [ 198] */,
- 0x5ba6c064 /* 00016e9b - 00016eff [ 101] */,
+ 0x5b5e8005 /* 00016d7a - 00016d7f [ 6] */,
+ 0x5b678001 /* 00016d9e - 00016d9f [ 2] */,
+ 0x5b6a8095 /* 00016daa - 00016e3f [ 150] */,
+ 0x5ba6c004 /* 00016e9b - 00016e9f [ 5] */,
+ 0x5bae4001 /* 00016eb9 - 00016eba [ 2] */,
+ 0x5bb5002b /* 00016ed4 - 00016eff [ 44] */,
0x5bd2c003 /* 00016f4b - 00016f4e [ 4] */,
0x5be20006 /* 00016f88 - 00016f8e [ 7] */,
0x5be8003f /* 00016fa0 - 00016fdf [ 64] */,
0x5bf9400a /* 00016fe5 - 00016fef [ 11] */,
- 0x5bfc800d /* 00016ff2 - 00016fff [ 14] */,
- 0x61fe0007 /* 000187f8 - 000187ff [ 8] */,
- 0x63358028 /* 00018cd6 - 00018cfe [ 41] */,
- 0x634262e6 /* 00018d09 - 0001afef [ 8935] */,
+ 0x5bfdc008 /* 00016ff7 - 00016fff [ 9] */,
+ 0x6336c023 /* 00018cdb - 00018cfe [ 36] */,
+ 0x6348405e /* 00018d21 - 00018d7f [ 95] */,
+ 0x637cc00c /* 00018df3 - 00018dff [ 13] */,
+ 0x6464800d /* 00019192 - 0001919f [ 14] */,
+ 0x6474de1c /* 000191d3 - 0001afef [ 7709] */,
0x6bfd0000 /* 0001aff4 - 0001aff4 [ 1] */,
0x6bff0000 /* 0001affc - 0001affc [ 1] */,
0x6bffc000 /* 0001afff - 0001afff [ 1] */,
- 0x6c48c00e /* 0001b123 - 0001b131 [ 15] */,
+ 0x6c4a4008 /* 0001b129 - 0001b131 [ 9] */,
0x6c4cc01c /* 0001b133 - 0001b14f [ 29] */,
0x6c54c001 /* 0001b153 - 0001b154 [ 2] */,
0x6c55800d /* 0001b156 - 0001b163 [ 14] */,
- 0x6c5a0007 /* 0001b168 - 0001b16f [ 8] */,
+ 0x6c5a4006 /* 0001b169 - 0001b16f [ 7] */,
0x6cbf0903 /* 0001b2fc - 0001bbff [ 2308] */,
0x6f1ac004 /* 0001bc6b - 0001bc6f [ 5] */,
0x6f1f4002 /* 0001bc7d - 0001bc7f [ 3] */,
0x6f224006 /* 0001bc89 - 0001bc8f [ 7] */,
0x6f268001 /* 0001bc9a - 0001bc9b [ 2] */,
0x6f280f5f /* 0001bca0 - 0001cbff [ 3936] */,
- 0x733e8005 /* 0001ccfa - 0001ccff [ 6] */,
- 0x73ad004b /* 0001ceb4 - 0001ceff [ 76] */,
+ 0x733f4002 /* 0001ccfd - 0001ccff [ 3] */,
+ 0x73ad0005 /* 0001ceb4 - 0001ceb9 [ 6] */,
+ 0x73b44000 /* 0001ced1 - 0001ced1 [ 1] */,
+ 0x73b54007 /* 0001ced5 - 0001cedc [ 8] */,
+ 0x73bf8001 /* 0001cefe - 0001ceff [ 2] */,
0x73cb8001 /* 0001cf2e - 0001cf2f [ 2] */,
0x73d1c008 /* 0001cf47 - 0001cf4f [ 9] */,
0x73f1003b /* 0001cfc4 - 0001cfff [ 60] */,
0x743d8009 /* 0001d0f6 - 0001d0ff [ 10] */,
- 0x7449c001 /* 0001d127 - 0001d128 [ 2] */,
0x745cc007 /* 0001d173 - 0001d17a [ 8] */,
- 0x747ac014 /* 0001d1eb - 0001d1ff [ 21] */,
- 0x74918079 /* 0001d246 - 0001d2bf [ 122] */,
+ 0x74918009 /* 0001d246 - 0001d24f [ 10] */,
+ 0x74a0803d /* 0001d282 - 0001d2bf [ 62] */,
0x74b5000b /* 0001d2d4 - 0001d2df [ 12] */,
0x74bd000b /* 0001d2f4 - 0001d2ff [ 12] */,
0x74d5c008 /* 0001d357 - 0001d35f [ 9] */,
@@ -733,13 +737,14 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x75514000 /* 0001d545 - 0001d545 [ 1] */,
0x7551c002 /* 0001d547 - 0001d549 [ 3] */,
0x75544000 /* 0001d551 - 0001d551 [ 1] */,
- 0x75a98001 /* 0001d6a6 - 0001d6a7 [ 2] */,
+ 0x75a9c000 /* 0001d6a7 - 0001d6a7 [ 1] */,
0x75f30001 /* 0001d7cc - 0001d7cd [ 2] */,
0x76a3000e /* 0001da8c - 0001da9a [ 15] */,
0x76a80000 /* 0001daa0 - 0001daa0 [ 1] */,
- 0x76ac044f /* 0001dab0 - 0001deff [ 1104] */,
- 0x77c7c005 /* 0001df1f - 0001df24 [ 6] */,
- 0x77cac0d4 /* 0001df2b - 0001dfff [ 213] */,
+ 0x76ac004f /* 0001dab0 - 0001daff [ 80] */,
+ 0x76c743e2 /* 0001db1d - 0001deff [ 995] */,
+ 0x77e0800d /* 0001df82 - 0001df8f [ 14] */,
+ 0x77e5c035 /* 0001df97 - 0001dfcc [ 54] */,
0x7801c000 /* 0001e007 - 0001e007 [ 1] */,
0x78064001 /* 0001e019 - 0001e01a [ 2] */,
0x78088000 /* 0001e022 - 0001e022 [ 1] */,
@@ -756,7 +761,10 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x78c001cf /* 0001e300 - 0001e4cf [ 464] */,
0x793e80d5 /* 0001e4fa - 0001e5cf [ 214] */,
0x797ec003 /* 0001e5fb - 0001e5fe [ 4] */,
- 0x798001df /* 0001e600 - 0001e7df [ 480] */,
+ 0x798000bf /* 0001e600 - 0001e6bf [ 192] */,
+ 0x79b7c000 /* 0001e6df - 0001e6df [ 1] */,
+ 0x79bd8007 /* 0001e6f6 - 0001e6fd [ 8] */,
+ 0x79c000df /* 0001e700 - 0001e7df [ 224] */,
0x79f9c000 /* 0001e7e7 - 0001e7e7 [ 1] */,
0x79fb0000 /* 0001e7ec - 0001e7ec [ 1] */,
0x79fbc000 /* 0001e7ef - 0001e7ef [ 1] */,
@@ -808,44 +816,45 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
0x7c300000 /* 0001f0c0 - 0001f0c0 [ 1] */,
0x7c340000 /* 0001f0d0 - 0001f0d0 [ 1] */,
0x7c3d8009 /* 0001f0f6 - 0001f0ff [ 10] */,
- 0x7c6b8037 /* 0001f1ae - 0001f1e5 [ 56] */,
+ 0x7c6bc036 /* 0001f1af - 0001f1e5 [ 55] */,
0x7c80c00c /* 0001f203 - 0001f20f [ 13] */,
0x7c8f0003 /* 0001f23c - 0001f23f [ 4] */,
0x7c924006 /* 0001f249 - 0001f24f [ 7] */,
0x7c94800d /* 0001f252 - 0001f25f [ 14] */,
0x7c998099 /* 0001f266 - 0001f2ff [ 154] */,
- 0x7db60003 /* 0001f6d8 - 0001f6db [ 4] */,
+ 0x7db68001 /* 0001f6da - 0001f6db [ 2] */,
0x7dbb4002 /* 0001f6ed - 0001f6ef [ 3] */,
0x7dbf4002 /* 0001f6fd - 0001f6ff [ 3] */,
- 0x7dddc003 /* 0001f777 - 0001f77a [ 4] */,
- 0x7df68005 /* 0001f7da - 0001f7df [ 6] */,
+ 0x7df70003 /* 0001f7dc - 0001f7df [ 4] */,
0x7dfb0003 /* 0001f7ec - 0001f7ef [ 4] */,
- 0x7dfc400e /* 0001f7f1 - 0001f7ff [ 15] */,
0x7e030003 /* 0001f80c - 0001f80f [ 4] */,
0x7e120007 /* 0001f848 - 0001f84f [ 8] */,
0x7e168005 /* 0001f85a - 0001f85f [ 6] */,
0x7e220007 /* 0001f888 - 0001f88f [ 8] */,
0x7e2b8001 /* 0001f8ae - 0001f8af [ 2] */,
0x7e2f0003 /* 0001f8bc - 0001f8bf [ 4] */,
- 0x7e30803d /* 0001f8c2 - 0001f8ff [ 62] */,
- 0x7e95000b /* 0001fa54 - 0001fa5f [ 12] */,
+ 0x7e30800d /* 0001f8c2 - 0001f8cf [ 14] */,
+ 0x7e364026 /* 0001f8d9 - 0001f8ff [ 39] */,
+ 0x7e960007 /* 0001fa58 - 0001fa5f [ 8] */,
0x7e9b8001 /* 0001fa6e - 0001fa6f [ 2] */,
0x7e9f4002 /* 0001fa7d - 0001fa7f [ 3] */,
- 0x7ea28004 /* 0001fa8a - 0001fa8e [ 5] */,
- 0x7eb1c006 /* 0001fac7 - 0001facd [ 7] */,
- 0x7eb74001 /* 0001fadd - 0001fade [ 2] */,
- 0x7eba8005 /* 0001faea - 0001faef [ 6] */,
- 0x7ebe4006 /* 0001faf9 - 0001faff [ 7] */,
+ 0x7eb1c000 /* 0001fac7 - 0001fac7 [ 1] */,
+ 0x7eb24002 /* 0001fac9 - 0001facb [ 3] */,
+ 0x7eb78000 /* 0001fade - 0001fade [ 1] */,
+ 0x7ebb0002 /* 0001faec - 0001faee [ 3] */,
+ 0x7ebec004 /* 0001fafb - 0001faff [ 5] */,
0x7ee4c000 /* 0001fb93 - 0001fb93 [ 1] */,
- 0x7efe8405 /* 0001fbfa - 0001ffff [ 1030] */,
+ 0x7efec404 /* 0001fbfb - 0001ffff [ 1029] */,
0xa9b8001f /* 0002a6e0 - 0002a6ff [ 32] */,
- 0xadce8005 /* 0002b73a - 0002b73f [ 6] */,
- 0xae078001 /* 0002b81e - 0002b81f [ 2] */,
- 0xb3a8800d /* 0002cea2 - 0002ceaf [ 14] */,
+ 0xae07c000 /* 0002b81f - 0002b81f [ 1] */,
+ 0xb3ab8001 /* 0002ceae - 0002ceaf [ 2] */,
0xbaf8400e /* 0002ebe1 - 0002ebef [ 15] */,
0xbb9789a1 /* 0002ee5e - 0002f7ff [ 2466] */,
0xbe8785e1 /* 0002fa1e - 0002ffff [ 1506] */,
- 0xc4d2c004 /* 0003134b - 0003134f [ 5] */};
+ 0xc4d2c004 /* 0003134b - 0003134f [ 5] */,
+ 0xcd1ebfff /* 0003347a - 00037479 [16384] */,
+ 0xdd1ebfff /* 0003747a - 0003b479 [16384] */,
+ 0xed1e9b85 /* 0003b47a - 0003cfff [ 7046] */};
/// Returns whether the code unit needs to be escaped.
///
@@ -866,7 +875,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
return false;
// The entries at the end.
- if (__code_point >= 0x000323b0)
+ if (__code_point >= 0x0003fc40)
return true;
ptrdiff_t __i = std::upper_bound(std::begin(__entries), std::end(__entries), (__code_point << 14) | 0x3fffu) - __entries;
>From b796ef4a6534e80ef6c1d6e821fde7a5300ab2e0 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:27:36 -0500
Subject: [PATCH 04/14] Update unicode-case-fold.py
---
llvm/utils/unicode-case-fold.py | 44 +++++++++++++++++----------------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/llvm/utils/unicode-case-fold.py b/llvm/utils/unicode-case-fold.py
index 4afb41d4060f0..ca0423082b724 100755
--- a/llvm/utils/unicode-case-fold.py
+++ b/llvm/utils/unicode-case-fold.py
@@ -36,7 +36,9 @@ def mappings(f):
m = expr.match(line)
if not m:
continue
- from_char = int(m.group(1), 16)
+ group_1 = m.group(1)
+ if "b" in group_1: group_1 = group_1[2:]
+ from_char = int(group_1, 16)
to_char = int(m.group(2), 16)
from_name = m.group(3)
@@ -101,28 +103,28 @@ def dump_block(b):
current_block = []
-f = urlopen(sys.argv[1])
-for m in mappings(f):
- if len(current_block) == 0:
- current_block.append(m)
- continue
-
- if shift(current_block[0]) != shift(m):
- # Incompatible shift, start a new block.
+with urlopen(sys.argv[1]) as f:
+    g = [x.decode("utf-8") for x in f.readlines()]  # bytes -> str; str(x) would give "b'...\n'" reprs
+    for m in mappings(g):
+        if len(current_block) == 0:
+            current_block.append(m)
+            continue
+
+        if shift(current_block[0]) != shift(m):
+            # Incompatible shift, start a new block.
+            dump_block(current_block)
+            current_block = [m]
+            continue
+
+        if len(current_block) == 1 or stride(current_block) == stride2(
+            current_block[-1], m
+        ):
+            current_block.append(m)
+            continue
+
+        # Incompatible stride, start a new block.
+        dump_block(current_block)
+        current_block = [m]
- continue
-
- if len(current_block) == 1 or stride(current_block) == stride2(
- current_block[-1], m
- ):
- current_block.append(m)
- continue
-
- # Incompatible stride, start a new block.
- dump_block(current_block)
- current_block = [m]
-f.close()
dump_block(current_block)
>From 0b6e8367e58be1c003430942372b81166faef7f3 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:28:40 -0500
Subject: [PATCH 05/14] Update UnicodeCaseFold.cpp
---
llvm/lib/Support/UnicodeCaseFold.cpp | 285 ++++++++++++++++-----------
1 file changed, 170 insertions(+), 115 deletions(-)
diff --git a/llvm/lib/Support/UnicodeCaseFold.cpp b/llvm/lib/Support/UnicodeCaseFold.cpp
index abf6df8d1e3c3..007c6261568ca 100644
--- a/llvm/lib/Support/UnicodeCaseFold.cpp
+++ b/llvm/lib/Support/UnicodeCaseFold.cpp
@@ -2,11 +2,11 @@
//
// This file was generated by utils/unicode-case-fold.py from the Unicode
// case folding database at
-// http://www.unicode.org/Public/15.1.0/ucd/CaseFolding.txt
+// https://www.unicode.org/Public/18.0.0/ucd/CaseFolding.txt
//
// To regenerate this file, run:
// utils/unicode-case-fold.py \
-// "http://www.unicode.org/Public/15.1.0/ucd/CaseFolding.txt" \
+// "https://www.unicode.org/Public/18.0.0/ucd/CaseFolding.txt" \
// > lib/Support/UnicodeCaseFold.cpp
//
//===----------------------------------------------------------------------===//
@@ -19,7 +19,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 26 characters
if (C <= 0x005a)
return C + 32;
- // MICRO SIGN
+ // MICRO SIGN\n'
if (C == 0x00b5)
return 0x03bc;
if (C < 0x00c0)
@@ -52,7 +52,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 23 characters
if (C <= 0x0176)
return C | 1;
- // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ // LATIN CAPITAL LETTER Y WITH DIAERESIS\n'
if (C == 0x0178)
return 0x00ff;
if (C < 0x0179)
@@ -60,10 +60,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x017d && C % 2 == 1)
return C + 1;
- // LATIN SMALL LETTER LONG S
+ // LATIN SMALL LETTER LONG S\n'
if (C == 0x017f)
return 0x0073;
- // LATIN CAPITAL LETTER B WITH HOOK
+ // LATIN CAPITAL LETTER B WITH HOOK\n'
if (C == 0x0181)
return 0x0253;
if (C < 0x0182)
@@ -71,10 +71,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x0184)
return C | 1;
- // LATIN CAPITAL LETTER OPEN O
+ // LATIN CAPITAL LETTER OPEN O\n'
if (C == 0x0186)
return 0x0254;
- // LATIN CAPITAL LETTER C WITH HOOK
+ // LATIN CAPITAL LETTER C WITH HOOK\n'
if (C == 0x0187)
return 0x0188;
if (C < 0x0189)
@@ -82,43 +82,43 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x018a)
return C + 205;
- // LATIN CAPITAL LETTER D WITH TOPBAR
+ // LATIN CAPITAL LETTER D WITH TOPBAR\n'
if (C == 0x018b)
return 0x018c;
- // LATIN CAPITAL LETTER REVERSED E
+ // LATIN CAPITAL LETTER REVERSED E\n'
if (C == 0x018e)
return 0x01dd;
- // LATIN CAPITAL LETTER SCHWA
+ // LATIN CAPITAL LETTER SCHWA\n'
if (C == 0x018f)
return 0x0259;
- // LATIN CAPITAL LETTER OPEN E
+ // LATIN CAPITAL LETTER OPEN E\n'
if (C == 0x0190)
return 0x025b;
- // LATIN CAPITAL LETTER F WITH HOOK
+ // LATIN CAPITAL LETTER F WITH HOOK\n'
if (C == 0x0191)
return 0x0192;
- // LATIN CAPITAL LETTER G WITH HOOK
+ // LATIN CAPITAL LETTER G WITH HOOK\n'
if (C == 0x0193)
return 0x0260;
- // LATIN CAPITAL LETTER GAMMA
+ // LATIN CAPITAL LETTER GAMMA\n'
if (C == 0x0194)
return 0x0263;
- // LATIN CAPITAL LETTER IOTA
+ // LATIN CAPITAL LETTER IOTA\n'
if (C == 0x0196)
return 0x0269;
- // LATIN CAPITAL LETTER I WITH STROKE
+ // LATIN CAPITAL LETTER I WITH STROKE\n'
if (C == 0x0197)
return 0x0268;
- // LATIN CAPITAL LETTER K WITH HOOK
+ // LATIN CAPITAL LETTER K WITH HOOK\n'
if (C == 0x0198)
return 0x0199;
- // LATIN CAPITAL LETTER TURNED M
+ // LATIN CAPITAL LETTER TURNED M\n'
if (C == 0x019c)
return 0x026f;
- // LATIN CAPITAL LETTER N WITH LEFT HOOK
+ // LATIN CAPITAL LETTER N WITH LEFT HOOK\n'
if (C == 0x019d)
return 0x0272;
- // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+ // LATIN CAPITAL LETTER O WITH MIDDLE TILDE\n'
if (C == 0x019f)
return 0x0275;
if (C < 0x01a0)
@@ -126,22 +126,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x01a4)
return C | 1;
- // LATIN LETTER YR
+ // LATIN LETTER YR\n'
if (C == 0x01a6)
return 0x0280;
- // LATIN CAPITAL LETTER TONE TWO
+ // LATIN CAPITAL LETTER TONE TWO\n'
if (C == 0x01a7)
return 0x01a8;
- // LATIN CAPITAL LETTER ESH
+ // LATIN CAPITAL LETTER ESH\n'
if (C == 0x01a9)
return 0x0283;
- // LATIN CAPITAL LETTER T WITH HOOK
+ // LATIN CAPITAL LETTER T WITH HOOK\n'
if (C == 0x01ac)
return 0x01ad;
- // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+ // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK\n'
if (C == 0x01ae)
return 0x0288;
- // LATIN CAPITAL LETTER U WITH HORN
+ // LATIN CAPITAL LETTER U WITH HORN\n'
if (C == 0x01af)
return 0x01b0;
if (C < 0x01b1)
@@ -154,7 +154,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x01b5 && C % 2 == 1)
return C + 1;
- // LATIN CAPITAL LETTER EZH
+ // LATIN CAPITAL LETTER EZH\n'
if (C == 0x01b7)
return 0x0292;
if (C < 0x01b8)
@@ -162,19 +162,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x01bc && C % 4 == 0)
return C + 1;
- // LATIN CAPITAL LETTER DZ WITH CARON
+ // LATIN CAPITAL LETTER DZ WITH CARON\n'
if (C == 0x01c4)
return 0x01c6;
- // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
+ // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON\n'
if (C == 0x01c5)
return 0x01c6;
- // LATIN CAPITAL LETTER LJ
+ // LATIN CAPITAL LETTER LJ\n'
if (C == 0x01c7)
return 0x01c9;
- // LATIN CAPITAL LETTER L WITH SMALL LETTER J
+ // LATIN CAPITAL LETTER L WITH SMALL LETTER J\n'
if (C == 0x01c8)
return 0x01c9;
- // LATIN CAPITAL LETTER NJ
+ // LATIN CAPITAL LETTER NJ\n'
if (C == 0x01ca)
return 0x01cc;
if (C < 0x01cb)
@@ -187,7 +187,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 9 characters
if (C <= 0x01ee)
return C | 1;
- // LATIN CAPITAL LETTER DZ
+ // LATIN CAPITAL LETTER DZ\n'
if (C == 0x01f1)
return 0x01f3;
if (C < 0x01f2)
@@ -195,10 +195,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x01f4)
return C | 1;
- // LATIN CAPITAL LETTER HWAIR
+ // LATIN CAPITAL LETTER HWAIR\n'
if (C == 0x01f6)
return 0x0195;
- // LATIN CAPITAL LETTER WYNN
+ // LATIN CAPITAL LETTER WYNN\n'
if (C == 0x01f7)
return 0x01bf;
if (C < 0x01f8)
@@ -206,7 +206,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 20 characters
if (C <= 0x021e)
return C | 1;
- // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
+ // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG\n'
if (C == 0x0220)
return 0x019e;
if (C < 0x0222)
@@ -214,28 +214,28 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 9 characters
if (C <= 0x0232)
return C | 1;
- // LATIN CAPITAL LETTER A WITH STROKE
+ // LATIN CAPITAL LETTER A WITH STROKE\n'
if (C == 0x023a)
return 0x2c65;
- // LATIN CAPITAL LETTER C WITH STROKE
+ // LATIN CAPITAL LETTER C WITH STROKE\n'
if (C == 0x023b)
return 0x023c;
- // LATIN CAPITAL LETTER L WITH BAR
+ // LATIN CAPITAL LETTER L WITH BAR\n'
if (C == 0x023d)
return 0x019a;
- // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
+ // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE\n'
if (C == 0x023e)
return 0x2c66;
- // LATIN CAPITAL LETTER GLOTTAL STOP
+ // LATIN CAPITAL LETTER GLOTTAL STOP\n'
if (C == 0x0241)
return 0x0242;
- // LATIN CAPITAL LETTER B WITH STROKE
+ // LATIN CAPITAL LETTER B WITH STROKE\n'
if (C == 0x0243)
return 0x0180;
- // LATIN CAPITAL LETTER U BAR
+ // LATIN CAPITAL LETTER U BAR\n'
if (C == 0x0244)
return 0x0289;
- // LATIN CAPITAL LETTER TURNED V
+ // LATIN CAPITAL LETTER TURNED V\n'
if (C == 0x0245)
return 0x028c;
if (C < 0x0246)
@@ -243,7 +243,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 5 characters
if (C <= 0x024e)
return C | 1;
- // COMBINING GREEK YPOGEGRAMMENI
+ // COMBINING GREEK YPOGEGRAMMENI\n'
if (C == 0x0345)
return 0x03b9;
if (C < 0x0370)
@@ -251,13 +251,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x0372)
return C | 1;
- // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+ // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA\n'
if (C == 0x0376)
return 0x0377;
- // GREEK CAPITAL LETTER YOT
+ // GREEK CAPITAL LETTER YOT\n'
if (C == 0x037f)
return 0x03f3;
- // GREEK CAPITAL LETTER ALPHA WITH TONOS
+ // GREEK CAPITAL LETTER ALPHA WITH TONOS\n'
if (C == 0x0386)
return 0x03ac;
if (C < 0x0388)
@@ -265,7 +265,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x038a)
return C + 37;
- // GREEK CAPITAL LETTER OMICRON WITH TONOS
+ // GREEK CAPITAL LETTER OMICRON WITH TONOS\n'
if (C == 0x038c)
return 0x03cc;
if (C < 0x038e)
@@ -283,22 +283,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 9 characters
if (C <= 0x03ab)
return C + 32;
- // GREEK SMALL LETTER FINAL SIGMA
+ // GREEK SMALL LETTER FINAL SIGMA\n'
if (C == 0x03c2)
return 0x03c3;
- // GREEK CAPITAL KAI SYMBOL
+ // GREEK CAPITAL KAI SYMBOL\n'
if (C == 0x03cf)
return 0x03d7;
- // GREEK BETA SYMBOL
+ // GREEK BETA SYMBOL\n'
if (C == 0x03d0)
return 0x03b2;
- // GREEK THETA SYMBOL
+ // GREEK THETA SYMBOL\n'
if (C == 0x03d1)
return 0x03b8;
- // GREEK PHI SYMBOL
+ // GREEK PHI SYMBOL\n'
if (C == 0x03d5)
return 0x03c6;
- // GREEK PI SYMBOL
+ // GREEK PI SYMBOL\n'
if (C == 0x03d6)
return 0x03c0;
if (C < 0x03d8)
@@ -306,25 +306,25 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 12 characters
if (C <= 0x03ee)
return C | 1;
- // GREEK KAPPA SYMBOL
+ // GREEK KAPPA SYMBOL\n'
if (C == 0x03f0)
return 0x03ba;
- // GREEK RHO SYMBOL
+ // GREEK RHO SYMBOL\n'
if (C == 0x03f1)
return 0x03c1;
- // GREEK CAPITAL THETA SYMBOL
+ // GREEK CAPITAL THETA SYMBOL\n'
if (C == 0x03f4)
return 0x03b8;
- // GREEK LUNATE EPSILON SYMBOL
+ // GREEK LUNATE EPSILON SYMBOL\n'
if (C == 0x03f5)
return 0x03b5;
- // GREEK CAPITAL LETTER SHO
+ // GREEK CAPITAL LETTER SHO\n'
if (C == 0x03f7)
return 0x03f8;
- // GREEK CAPITAL LUNATE SIGMA SYMBOL
+ // GREEK CAPITAL LUNATE SIGMA SYMBOL\n'
if (C == 0x03f9)
return 0x03f2;
- // GREEK CAPITAL LETTER SAN
+ // GREEK CAPITAL LETTER SAN\n'
if (C == 0x03fa)
return 0x03fb;
if (C < 0x03fd)
@@ -352,7 +352,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 27 characters
if (C <= 0x04be)
return C | 1;
- // CYRILLIC LETTER PALOCHKA
+ // CYRILLIC LETTER PALOCHKA\n'
if (C == 0x04c0)
return 0x04cf;
if (C < 0x04c1)
@@ -385,13 +385,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 6 characters
if (C <= 0x13fd)
return C + -8;
- // CYRILLIC SMALL LETTER ROUNDED VE
+ // CYRILLIC SMALL LETTER ROUNDED VE\n'
if (C == 0x1c80)
return 0x0432;
- // CYRILLIC SMALL LETTER LONG-LEGGED DE
+ // CYRILLIC SMALL LETTER LONG-LEGGED DE\n'
if (C == 0x1c81)
return 0x0434;
- // CYRILLIC SMALL LETTER NARROW O
+ // CYRILLIC SMALL LETTER NARROW O\n'
if (C == 0x1c82)
return 0x043e;
if (C < 0x1c83)
@@ -399,18 +399,21 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1c84)
return C + -6210;
- // CYRILLIC SMALL LETTER THREE-LEGGED TE
+ // CYRILLIC SMALL LETTER THREE-LEGGED TE\n'
if (C == 0x1c85)
return 0x0442;
- // CYRILLIC SMALL LETTER TALL HARD SIGN
+ // CYRILLIC SMALL LETTER TALL HARD SIGN\n'
if (C == 0x1c86)
return 0x044a;
- // CYRILLIC SMALL LETTER TALL YAT
+ // CYRILLIC SMALL LETTER TALL YAT\n'
if (C == 0x1c87)
return 0x0463;
- // CYRILLIC SMALL LETTER UNBLENDED UK
+ // CYRILLIC SMALL LETTER UNBLENDED UK\n'
if (C == 0x1c88)
return 0xa64b;
+ // CYRILLIC CAPITAL LETTER TJE\n'
+ if (C == 0x1c89)
+ return 0x1c8a;
if (C < 0x1c90)
return C;
// 43 characters
@@ -426,10 +429,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 75 characters
if (C <= 0x1e94)
return C | 1;
- // LATIN SMALL LETTER LONG S WITH DOT ABOVE
+ // LATIN SMALL LETTER LONG S WITH DOT ABOVE\n'
if (C == 0x1e9b)
return 0x1e61;
- // LATIN CAPITAL LETTER SHARP S
+ // LATIN CAPITAL LETTER SHARP S\n'
if (C == 0x1e9e)
return 0x00df;
if (C < 0x1ea0)
@@ -497,10 +500,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1fbb)
return C + -74;
- // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+ // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI\n'
if (C == 0x1fbc)
return 0x1fb3;
- // GREEK PROSGEGRAMMENI
+ // GREEK PROSGEGRAMMENI\n'
if (C == 0x1fbe)
return 0x03b9;
if (C < 0x1fc8)
@@ -508,10 +511,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 4 characters
if (C <= 0x1fcb)
return C + -86;
- // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+ // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI\n'
if (C == 0x1fcc)
return 0x1fc3;
- // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+ // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA\n'
if (C == 0x1fd3)
return 0x0390;
if (C < 0x1fd8)
@@ -524,7 +527,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1fdb)
return C + -100;
- // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+ // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA\n'
if (C == 0x1fe3)
return 0x03b0;
if (C < 0x1fe8)
@@ -537,7 +540,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1feb)
return C + -112;
- // GREEK CAPITAL LETTER RHO WITH DASIA
+ // GREEK CAPITAL LETTER RHO WITH DASIA\n'
if (C == 0x1fec)
return 0x1fe5;
if (C < 0x1ff8)
@@ -550,19 +553,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1ffb)
return C + -126;
- // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+ // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI\n'
if (C == 0x1ffc)
return 0x1ff3;
- // OHM SIGN
+ // OHM SIGN\n'
if (C == 0x2126)
return 0x03c9;
- // KELVIN SIGN
+ // KELVIN SIGN\n'
if (C == 0x212a)
return 0x006b;
- // ANGSTROM SIGN
+ // ANGSTROM SIGN\n'
if (C == 0x212b)
return 0x00e5;
- // TURNED CAPITAL F
+ // TURNED CAPITAL F\n'
if (C == 0x2132)
return 0x214e;
if (C < 0x2160)
@@ -570,7 +573,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 16 characters
if (C <= 0x216f)
return C + 16;
- // ROMAN NUMERAL REVERSED ONE HUNDRED
+ // ROMAN NUMERAL REVERSED ONE HUNDRED\n'
if (C == 0x2183)
return 0x2184;
if (C < 0x24b6)
@@ -583,16 +586,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 48 characters
if (C <= 0x2c2f)
return C + 48;
- // LATIN CAPITAL LETTER L WITH DOUBLE BAR
+ // LATIN CAPITAL LETTER L WITH DOUBLE BAR\n'
if (C == 0x2c60)
return 0x2c61;
- // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
+ // LATIN CAPITAL LETTER L WITH MIDDLE TILDE\n'
if (C == 0x2c62)
return 0x026b;
- // LATIN CAPITAL LETTER P WITH STROKE
+ // LATIN CAPITAL LETTER P WITH STROKE\n'
if (C == 0x2c63)
return 0x1d7d;
- // LATIN CAPITAL LETTER R WITH TAIL
+ // LATIN CAPITAL LETTER R WITH TAIL\n'
if (C == 0x2c64)
return 0x027d;
if (C < 0x2c67)
@@ -600,16 +603,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x2c6b && C % 2 == 1)
return C + 1;
- // LATIN CAPITAL LETTER ALPHA
+ // LATIN CAPITAL LETTER ALPHA\n'
if (C == 0x2c6d)
return 0x0251;
- // LATIN CAPITAL LETTER M WITH HOOK
+ // LATIN CAPITAL LETTER M WITH HOOK\n'
if (C == 0x2c6e)
return 0x0271;
- // LATIN CAPITAL LETTER TURNED A
+ // LATIN CAPITAL LETTER TURNED A\n'
if (C == 0x2c6f)
return 0x0250;
- // LATIN CAPITAL LETTER TURNED ALPHA
+ // LATIN CAPITAL LETTER TURNED ALPHA\n'
if (C == 0x2c70)
return 0x0252;
if (C < 0x2c72)
@@ -662,7 +665,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0xa77b && C % 2 == 1)
return C + 1;
- // LATIN CAPITAL LETTER INSULAR G
+ // LATIN CAPITAL LETTER INSULAR G\n'
if (C == 0xa77d)
return 0x1d79;
if (C < 0xa77e)
@@ -670,10 +673,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 5 characters
if (C <= 0xa786)
return C | 1;
- // LATIN CAPITAL LETTER SALTILLO
+ // LATIN CAPITAL LETTER SALTILLO\n'
if (C == 0xa78b)
return 0xa78c;
- // LATIN CAPITAL LETTER TURNED H
+ // LATIN CAPITAL LETTER TURNED H\n'
if (C == 0xa78d)
return 0x0265;
if (C < 0xa790)
@@ -686,31 +689,31 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 10 characters
if (C <= 0xa7a8)
return C | 1;
- // LATIN CAPITAL LETTER H WITH HOOK
+ // LATIN CAPITAL LETTER H WITH HOOK\n'
if (C == 0xa7aa)
return 0x0266;
- // LATIN CAPITAL LETTER REVERSED OPEN E
+ // LATIN CAPITAL LETTER REVERSED OPEN E\n'
if (C == 0xa7ab)
return 0x025c;
- // LATIN CAPITAL LETTER SCRIPT G
+ // LATIN CAPITAL LETTER SCRIPT G\n'
if (C == 0xa7ac)
return 0x0261;
- // LATIN CAPITAL LETTER L WITH BELT
+ // LATIN CAPITAL LETTER L WITH BELT\n'
if (C == 0xa7ad)
return 0x026c;
- // LATIN CAPITAL LETTER SMALL CAPITAL I
+ // LATIN CAPITAL LETTER SMALL CAPITAL I\n'
if (C == 0xa7ae)
return 0x026a;
- // LATIN CAPITAL LETTER TURNED K
+ // LATIN CAPITAL LETTER TURNED K\n'
if (C == 0xa7b0)
return 0x029e;
- // LATIN CAPITAL LETTER TURNED T
+ // LATIN CAPITAL LETTER TURNED T\n'
if (C == 0xa7b1)
return 0x0287;
- // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
+ // LATIN CAPITAL LETTER J WITH CROSSED-TAIL\n'
if (C == 0xa7b2)
return 0x029d;
- // LATIN CAPITAL LETTER CHI
+ // LATIN CAPITAL LETTER CHI\n'
if (C == 0xa7b3)
return 0xab53;
if (C < 0xa7b4)
@@ -718,13 +721,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 8 characters
if (C <= 0xa7c2)
return C | 1;
- // LATIN CAPITAL LETTER C WITH PALATAL HOOK
+ // LATIN CAPITAL LETTER C WITH PALATAL HOOK\n'
if (C == 0xa7c4)
return 0xa794;
- // LATIN CAPITAL LETTER S WITH HOOK
+ // LATIN CAPITAL LETTER S WITH HOOK\n'
if (C == 0xa7c5)
return 0x0282;
- // LATIN CAPITAL LETTER Z WITH PALATAL HOOK
+ // LATIN CAPITAL LETTER Z WITH PALATAL HOOK\n'
if (C == 0xa7c6)
return 0x1d8e;
if (C < 0xa7c7)
@@ -732,22 +735,36 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0xa7c9 && C % 2 == 1)
return C + 1;
- if (C < 0xa7d0)
+ // LATIN CAPITAL LETTER RAMS HORN\n'
+ if (C == 0xa7cb)
+ return 0x0264;
+ if (C < 0xa7cc)
+ return C;
+ // 8 characters
+ if (C <= 0xa7da)
+ return C | 1;
+ // LATIN CAPITAL LETTER LAMBDA WITH STROKE\n'
+ if (C == 0xa7dc)
+ return 0x019b;
+ if (C < 0xa7dd)
return C;
// 2 characters
- if (C <= 0xa7d6 && C % 6 == 0)
- return C + 1;
- if (C < 0xa7d8)
+ if (C <= 0xa7e2 && C % 5 == 3)
+ return C + -42342;
+ // LATIN CAPITAL LETTER REVERSED HALF H\n'
+ if (C == 0xa7f5)
+ return 0xa7f6;
+ if (C < 0xab6c)
return C;
// 2 characters
- if (C <= 0xa7f5 && C % 29 == 19)
- return C + 1;
+ if (C <= 0xab6d)
+ return C + -33;
if (C < 0xab70)
return C;
// 80 characters
if (C <= 0xabbf)
return C + -38864;
- // LATIN SMALL LIGATURE LONG S T
+ // LATIN SMALL LIGATURE LONG S T\n'
if (C == 0xfb05)
return 0xfb06;
if (C < 0xff21)
@@ -790,6 +807,11 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 51 characters
if (C <= 0x10cb2)
return C + 64;
+ if (C < 0x10d50)
+ return C;
+ // 22 characters
+ if (C <= 0x10d65)
+ return C + 32;
if (C < 0x118a0)
return C;
// 32 characters
@@ -800,6 +822,39 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 32 characters
if (C <= 0x16e5f)
return C + 32;
+ if (C < 0x16ea0)
+ return C;
+ // 25 characters
+ if (C <= 0x16eb8)
+ return C + 27;
+ if (C < 0x1df40)
+ return C;
+ // 2 characters
+ if (C <= 0x1df48 && C % 8 == 0)
+ return C + 1;
+ if (C < 0x1df4a)
+ return C;
+ // 2 characters
+ if (C <= 0x1df4d && C % 3 == 1)
+ return C + 1;
+ if (C < 0x1df51)
+ return C;
+ // 2 characters
+ if (C <= 0x1df68 && C % 23 == 0)
+ return C + 1;
+ if (C < 0x1df6a)
+ return C;
+ // 3 characters
+ if (C <= 0x1df6e)
+ return C | 1;
+ if (C < 0x1df72)
+ return C;
+ // 7 characters
+ if (C <= 0x1df7e)
+ return C | 1;
+ // LATIN SMALL LIGATURE LONG S WITH DESCENDER S\n'
+ if (C == 0x1df95)
+ return 0x00df;
if (C < 0x1e900)
return C;
// 34 characters
>From 23a8738ac88fe6400efcfb7a6bb6d907cf9d6976 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:40:45 -0500
Subject: [PATCH 06/14] Update unicode-case-fold.py
---
llvm/utils/unicode-case-fold.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/utils/unicode-case-fold.py b/llvm/utils/unicode-case-fold.py
index ca0423082b724..cfa5b279facf3 100755
--- a/llvm/utils/unicode-case-fold.py
+++ b/llvm/utils/unicode-case-fold.py
@@ -40,7 +40,9 @@ def mappings(f):
if "b" in group_1: group_1 = group_1[2:]
from_char = int(group_1, 16)
to_char = int(m.group(2), 16)
- from_name = m.group(3)
+ group_3 = m.group(3)
+ if "\\n" in group_3: group_3 = group_3[:-3]
+ from_name = group_3
if from_char <= previous_from:
raise Exception("Duplicate or unsorted characters in input")
@@ -68,7 +70,6 @@ def stride(block):
# shift and the stride between adjecant mappings (if any) is constant.
def dump_block(b):
global body
-
if len(b) == 1:
# Special case for handling blocks of length 1. We don't even need to
# emit the "if (C < X) return C" check below as all characters in this
>From ead2efbcc813f7072f7a15547eb651c81f831541 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:41:50 -0500
Subject: [PATCH 07/14] Update UnicodeCaseFold.cpp
---
llvm/lib/Support/UnicodeCaseFold.cpp | 224 +++++++++++++--------------
1 file changed, 112 insertions(+), 112 deletions(-)
diff --git a/llvm/lib/Support/UnicodeCaseFold.cpp b/llvm/lib/Support/UnicodeCaseFold.cpp
index 007c6261568ca..3e44ac5210f71 100644
--- a/llvm/lib/Support/UnicodeCaseFold.cpp
+++ b/llvm/lib/Support/UnicodeCaseFold.cpp
@@ -19,7 +19,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 26 characters
if (C <= 0x005a)
return C + 32;
- // MICRO SIGN\n'
+ // MICRO SIGN
if (C == 0x00b5)
return 0x03bc;
if (C < 0x00c0)
@@ -52,7 +52,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 23 characters
if (C <= 0x0176)
return C | 1;
- // LATIN CAPITAL LETTER Y WITH DIAERESIS\n'
+ // LATIN CAPITAL LETTER Y WITH DIAERESIS
if (C == 0x0178)
return 0x00ff;
if (C < 0x0179)
@@ -60,10 +60,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x017d && C % 2 == 1)
return C + 1;
- // LATIN SMALL LETTER LONG S\n'
+ // LATIN SMALL LETTER LONG S
if (C == 0x017f)
return 0x0073;
- // LATIN CAPITAL LETTER B WITH HOOK\n'
+ // LATIN CAPITAL LETTER B WITH HOOK
if (C == 0x0181)
return 0x0253;
if (C < 0x0182)
@@ -71,10 +71,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x0184)
return C | 1;
- // LATIN CAPITAL LETTER OPEN O\n'
+ // LATIN CAPITAL LETTER OPEN O
if (C == 0x0186)
return 0x0254;
- // LATIN CAPITAL LETTER C WITH HOOK\n'
+ // LATIN CAPITAL LETTER C WITH HOOK
if (C == 0x0187)
return 0x0188;
if (C < 0x0189)
@@ -82,43 +82,43 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x018a)
return C + 205;
- // LATIN CAPITAL LETTER D WITH TOPBAR\n'
+ // LATIN CAPITAL LETTER D WITH TOPBAR
if (C == 0x018b)
return 0x018c;
- // LATIN CAPITAL LETTER REVERSED E\n'
+ // LATIN CAPITAL LETTER REVERSED E
if (C == 0x018e)
return 0x01dd;
- // LATIN CAPITAL LETTER SCHWA\n'
+ // LATIN CAPITAL LETTER SCHWA
if (C == 0x018f)
return 0x0259;
- // LATIN CAPITAL LETTER OPEN E\n'
+ // LATIN CAPITAL LETTER OPEN E
if (C == 0x0190)
return 0x025b;
- // LATIN CAPITAL LETTER F WITH HOOK\n'
+ // LATIN CAPITAL LETTER F WITH HOOK
if (C == 0x0191)
return 0x0192;
- // LATIN CAPITAL LETTER G WITH HOOK\n'
+ // LATIN CAPITAL LETTER G WITH HOOK
if (C == 0x0193)
return 0x0260;
- // LATIN CAPITAL LETTER GAMMA\n'
+ // LATIN CAPITAL LETTER GAMMA
if (C == 0x0194)
return 0x0263;
- // LATIN CAPITAL LETTER IOTA\n'
+ // LATIN CAPITAL LETTER IOTA
if (C == 0x0196)
return 0x0269;
- // LATIN CAPITAL LETTER I WITH STROKE\n'
+ // LATIN CAPITAL LETTER I WITH STROKE
if (C == 0x0197)
return 0x0268;
- // LATIN CAPITAL LETTER K WITH HOOK\n'
+ // LATIN CAPITAL LETTER K WITH HOOK
if (C == 0x0198)
return 0x0199;
- // LATIN CAPITAL LETTER TURNED M\n'
+ // LATIN CAPITAL LETTER TURNED M
if (C == 0x019c)
return 0x026f;
- // LATIN CAPITAL LETTER N WITH LEFT HOOK\n'
+ // LATIN CAPITAL LETTER N WITH LEFT HOOK
if (C == 0x019d)
return 0x0272;
- // LATIN CAPITAL LETTER O WITH MIDDLE TILDE\n'
+ // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
if (C == 0x019f)
return 0x0275;
if (C < 0x01a0)
@@ -126,22 +126,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x01a4)
return C | 1;
- // LATIN LETTER YR\n'
+ // LATIN LETTER YR
if (C == 0x01a6)
return 0x0280;
- // LATIN CAPITAL LETTER TONE TWO\n'
+ // LATIN CAPITAL LETTER TONE TWO
if (C == 0x01a7)
return 0x01a8;
- // LATIN CAPITAL LETTER ESH\n'
+ // LATIN CAPITAL LETTER ESH
if (C == 0x01a9)
return 0x0283;
- // LATIN CAPITAL LETTER T WITH HOOK\n'
+ // LATIN CAPITAL LETTER T WITH HOOK
if (C == 0x01ac)
return 0x01ad;
- // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK\n'
+ // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
if (C == 0x01ae)
return 0x0288;
- // LATIN CAPITAL LETTER U WITH HORN\n'
+ // LATIN CAPITAL LETTER U WITH HORN
if (C == 0x01af)
return 0x01b0;
if (C < 0x01b1)
@@ -154,7 +154,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x01b5 && C % 2 == 1)
return C + 1;
- // LATIN CAPITAL LETTER EZH\n'
+ // LATIN CAPITAL LETTER EZH
if (C == 0x01b7)
return 0x0292;
if (C < 0x01b8)
@@ -162,19 +162,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x01bc && C % 4 == 0)
return C + 1;
- // LATIN CAPITAL LETTER DZ WITH CARON\n'
+ // LATIN CAPITAL LETTER DZ WITH CARON
if (C == 0x01c4)
return 0x01c6;
- // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON\n'
+ // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
if (C == 0x01c5)
return 0x01c6;
- // LATIN CAPITAL LETTER LJ\n'
+ // LATIN CAPITAL LETTER LJ
if (C == 0x01c7)
return 0x01c9;
- // LATIN CAPITAL LETTER L WITH SMALL LETTER J\n'
+ // LATIN CAPITAL LETTER L WITH SMALL LETTER J
if (C == 0x01c8)
return 0x01c9;
- // LATIN CAPITAL LETTER NJ\n'
+ // LATIN CAPITAL LETTER NJ
if (C == 0x01ca)
return 0x01cc;
if (C < 0x01cb)
@@ -187,7 +187,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 9 characters
if (C <= 0x01ee)
return C | 1;
- // LATIN CAPITAL LETTER DZ\n'
+ // LATIN CAPITAL LETTER DZ
if (C == 0x01f1)
return 0x01f3;
if (C < 0x01f2)
@@ -195,10 +195,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x01f4)
return C | 1;
- // LATIN CAPITAL LETTER HWAIR\n'
+ // LATIN CAPITAL LETTER HWAIR
if (C == 0x01f6)
return 0x0195;
- // LATIN CAPITAL LETTER WYNN\n'
+ // LATIN CAPITAL LETTER WYNN
if (C == 0x01f7)
return 0x01bf;
if (C < 0x01f8)
@@ -206,7 +206,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 20 characters
if (C <= 0x021e)
return C | 1;
- // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG\n'
+ // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
if (C == 0x0220)
return 0x019e;
if (C < 0x0222)
@@ -214,28 +214,28 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 9 characters
if (C <= 0x0232)
return C | 1;
- // LATIN CAPITAL LETTER A WITH STROKE\n'
+ // LATIN CAPITAL LETTER A WITH STROKE
if (C == 0x023a)
return 0x2c65;
- // LATIN CAPITAL LETTER C WITH STROKE\n'
+ // LATIN CAPITAL LETTER C WITH STROKE
if (C == 0x023b)
return 0x023c;
- // LATIN CAPITAL LETTER L WITH BAR\n'
+ // LATIN CAPITAL LETTER L WITH BAR
if (C == 0x023d)
return 0x019a;
- // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE\n'
+ // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
if (C == 0x023e)
return 0x2c66;
- // LATIN CAPITAL LETTER GLOTTAL STOP\n'
+ // LATIN CAPITAL LETTER GLOTTAL STOP
if (C == 0x0241)
return 0x0242;
- // LATIN CAPITAL LETTER B WITH STROKE\n'
+ // LATIN CAPITAL LETTER B WITH STROKE
if (C == 0x0243)
return 0x0180;
- // LATIN CAPITAL LETTER U BAR\n'
+ // LATIN CAPITAL LETTER U BAR
if (C == 0x0244)
return 0x0289;
- // LATIN CAPITAL LETTER TURNED V\n'
+ // LATIN CAPITAL LETTER TURNED V
if (C == 0x0245)
return 0x028c;
if (C < 0x0246)
@@ -243,7 +243,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 5 characters
if (C <= 0x024e)
return C | 1;
- // COMBINING GREEK YPOGEGRAMMENI\n'
+ // COMBINING GREEK YPOGEGRAMMENI
if (C == 0x0345)
return 0x03b9;
if (C < 0x0370)
@@ -251,13 +251,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x0372)
return C | 1;
- // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA\n'
+ // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
if (C == 0x0376)
return 0x0377;
- // GREEK CAPITAL LETTER YOT\n'
+ // GREEK CAPITAL LETTER YOT
if (C == 0x037f)
return 0x03f3;
- // GREEK CAPITAL LETTER ALPHA WITH TONOS\n'
+ // GREEK CAPITAL LETTER ALPHA WITH TONOS
if (C == 0x0386)
return 0x03ac;
if (C < 0x0388)
@@ -265,7 +265,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x038a)
return C + 37;
- // GREEK CAPITAL LETTER OMICRON WITH TONOS\n'
+ // GREEK CAPITAL LETTER OMICRON WITH TONOS
if (C == 0x038c)
return 0x03cc;
if (C < 0x038e)
@@ -283,22 +283,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 9 characters
if (C <= 0x03ab)
return C + 32;
- // GREEK SMALL LETTER FINAL SIGMA\n'
+ // GREEK SMALL LETTER FINAL SIGMA
if (C == 0x03c2)
return 0x03c3;
- // GREEK CAPITAL KAI SYMBOL\n'
+ // GREEK CAPITAL KAI SYMBOL
if (C == 0x03cf)
return 0x03d7;
- // GREEK BETA SYMBOL\n'
+ // GREEK BETA SYMBOL
if (C == 0x03d0)
return 0x03b2;
- // GREEK THETA SYMBOL\n'
+ // GREEK THETA SYMBOL
if (C == 0x03d1)
return 0x03b8;
- // GREEK PHI SYMBOL\n'
+ // GREEK PHI SYMBOL
if (C == 0x03d5)
return 0x03c6;
- // GREEK PI SYMBOL\n'
+ // GREEK PI SYMBOL
if (C == 0x03d6)
return 0x03c0;
if (C < 0x03d8)
@@ -306,25 +306,25 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 12 characters
if (C <= 0x03ee)
return C | 1;
- // GREEK KAPPA SYMBOL\n'
+ // GREEK KAPPA SYMBOL
if (C == 0x03f0)
return 0x03ba;
- // GREEK RHO SYMBOL\n'
+ // GREEK RHO SYMBOL
if (C == 0x03f1)
return 0x03c1;
- // GREEK CAPITAL THETA SYMBOL\n'
+ // GREEK CAPITAL THETA SYMBOL
if (C == 0x03f4)
return 0x03b8;
- // GREEK LUNATE EPSILON SYMBOL\n'
+ // GREEK LUNATE EPSILON SYMBOL
if (C == 0x03f5)
return 0x03b5;
- // GREEK CAPITAL LETTER SHO\n'
+ // GREEK CAPITAL LETTER SHO
if (C == 0x03f7)
return 0x03f8;
- // GREEK CAPITAL LUNATE SIGMA SYMBOL\n'
+ // GREEK CAPITAL LUNATE SIGMA SYMBOL
if (C == 0x03f9)
return 0x03f2;
- // GREEK CAPITAL LETTER SAN\n'
+ // GREEK CAPITAL LETTER SAN
if (C == 0x03fa)
return 0x03fb;
if (C < 0x03fd)
@@ -352,7 +352,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 27 characters
if (C <= 0x04be)
return C | 1;
- // CYRILLIC LETTER PALOCHKA\n'
+ // CYRILLIC LETTER PALOCHKA
if (C == 0x04c0)
return 0x04cf;
if (C < 0x04c1)
@@ -385,13 +385,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 6 characters
if (C <= 0x13fd)
return C + -8;
- // CYRILLIC SMALL LETTER ROUNDED VE\n'
+ // CYRILLIC SMALL LETTER ROUNDED VE
if (C == 0x1c80)
return 0x0432;
- // CYRILLIC SMALL LETTER LONG-LEGGED DE\n'
+ // CYRILLIC SMALL LETTER LONG-LEGGED DE
if (C == 0x1c81)
return 0x0434;
- // CYRILLIC SMALL LETTER NARROW O\n'
+ // CYRILLIC SMALL LETTER NARROW O
if (C == 0x1c82)
return 0x043e;
if (C < 0x1c83)
@@ -399,19 +399,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1c84)
return C + -6210;
- // CYRILLIC SMALL LETTER THREE-LEGGED TE\n'
+ // CYRILLIC SMALL LETTER THREE-LEGGED TE
if (C == 0x1c85)
return 0x0442;
- // CYRILLIC SMALL LETTER TALL HARD SIGN\n'
+ // CYRILLIC SMALL LETTER TALL HARD SIGN
if (C == 0x1c86)
return 0x044a;
- // CYRILLIC SMALL LETTER TALL YAT\n'
+ // CYRILLIC SMALL LETTER TALL YAT
if (C == 0x1c87)
return 0x0463;
- // CYRILLIC SMALL LETTER UNBLENDED UK\n'
+ // CYRILLIC SMALL LETTER UNBLENDED UK
if (C == 0x1c88)
return 0xa64b;
- // CYRILLIC CAPITAL LETTER TJE\n'
+ // CYRILLIC CAPITAL LETTER TJE
if (C == 0x1c89)
return 0x1c8a;
if (C < 0x1c90)
@@ -429,10 +429,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 75 characters
if (C <= 0x1e94)
return C | 1;
- // LATIN SMALL LETTER LONG S WITH DOT ABOVE\n'
+ // LATIN SMALL LETTER LONG S WITH DOT ABOVE
if (C == 0x1e9b)
return 0x1e61;
- // LATIN CAPITAL LETTER SHARP S\n'
+ // LATIN CAPITAL LETTER SHARP S
if (C == 0x1e9e)
return 0x00df;
if (C < 0x1ea0)
@@ -500,10 +500,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1fbb)
return C + -74;
- // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI\n'
+ // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
if (C == 0x1fbc)
return 0x1fb3;
- // GREEK PROSGEGRAMMENI\n'
+ // GREEK PROSGEGRAMMENI
if (C == 0x1fbe)
return 0x03b9;
if (C < 0x1fc8)
@@ -511,10 +511,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 4 characters
if (C <= 0x1fcb)
return C + -86;
- // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI\n'
+ // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
if (C == 0x1fcc)
return 0x1fc3;
- // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA\n'
+ // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
if (C == 0x1fd3)
return 0x0390;
if (C < 0x1fd8)
@@ -527,7 +527,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1fdb)
return C + -100;
- // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA\n'
+ // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
if (C == 0x1fe3)
return 0x03b0;
if (C < 0x1fe8)
@@ -540,7 +540,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1feb)
return C + -112;
- // GREEK CAPITAL LETTER RHO WITH DASIA\n'
+ // GREEK CAPITAL LETTER RHO WITH DASIA
if (C == 0x1fec)
return 0x1fe5;
if (C < 0x1ff8)
@@ -553,19 +553,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0x1ffb)
return C + -126;
- // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI\n'
+ // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
if (C == 0x1ffc)
return 0x1ff3;
- // OHM SIGN\n'
+ // OHM SIGN
if (C == 0x2126)
return 0x03c9;
- // KELVIN SIGN\n'
+ // KELVIN SIGN
if (C == 0x212a)
return 0x006b;
- // ANGSTROM SIGN\n'
+ // ANGSTROM SIGN
if (C == 0x212b)
return 0x00e5;
- // TURNED CAPITAL F\n'
+ // TURNED CAPITAL F
if (C == 0x2132)
return 0x214e;
if (C < 0x2160)
@@ -573,7 +573,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 16 characters
if (C <= 0x216f)
return C + 16;
- // ROMAN NUMERAL REVERSED ONE HUNDRED\n'
+ // ROMAN NUMERAL REVERSED ONE HUNDRED
if (C == 0x2183)
return 0x2184;
if (C < 0x24b6)
@@ -586,16 +586,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 48 characters
if (C <= 0x2c2f)
return C + 48;
- // LATIN CAPITAL LETTER L WITH DOUBLE BAR\n'
+ // LATIN CAPITAL LETTER L WITH DOUBLE BAR
if (C == 0x2c60)
return 0x2c61;
- // LATIN CAPITAL LETTER L WITH MIDDLE TILDE\n'
+ // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
if (C == 0x2c62)
return 0x026b;
- // LATIN CAPITAL LETTER P WITH STROKE\n'
+ // LATIN CAPITAL LETTER P WITH STROKE
if (C == 0x2c63)
return 0x1d7d;
- // LATIN CAPITAL LETTER R WITH TAIL\n'
+ // LATIN CAPITAL LETTER R WITH TAIL
if (C == 0x2c64)
return 0x027d;
if (C < 0x2c67)
@@ -603,16 +603,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 3 characters
if (C <= 0x2c6b && C % 2 == 1)
return C + 1;
- // LATIN CAPITAL LETTER ALPHA\n'
+ // LATIN CAPITAL LETTER ALPHA
if (C == 0x2c6d)
return 0x0251;
- // LATIN CAPITAL LETTER M WITH HOOK\n'
+ // LATIN CAPITAL LETTER M WITH HOOK
if (C == 0x2c6e)
return 0x0271;
- // LATIN CAPITAL LETTER TURNED A\n'
+ // LATIN CAPITAL LETTER TURNED A
if (C == 0x2c6f)
return 0x0250;
- // LATIN CAPITAL LETTER TURNED ALPHA\n'
+ // LATIN CAPITAL LETTER TURNED ALPHA
if (C == 0x2c70)
return 0x0252;
if (C < 0x2c72)
@@ -665,7 +665,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0xa77b && C % 2 == 1)
return C + 1;
- // LATIN CAPITAL LETTER INSULAR G\n'
+ // LATIN CAPITAL LETTER INSULAR G
if (C == 0xa77d)
return 0x1d79;
if (C < 0xa77e)
@@ -673,10 +673,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 5 characters
if (C <= 0xa786)
return C | 1;
- // LATIN CAPITAL LETTER SALTILLO\n'
+ // LATIN CAPITAL LETTER SALTILLO
if (C == 0xa78b)
return 0xa78c;
- // LATIN CAPITAL LETTER TURNED H\n'
+ // LATIN CAPITAL LETTER TURNED H
if (C == 0xa78d)
return 0x0265;
if (C < 0xa790)
@@ -689,31 +689,31 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 10 characters
if (C <= 0xa7a8)
return C | 1;
- // LATIN CAPITAL LETTER H WITH HOOK\n'
+ // LATIN CAPITAL LETTER H WITH HOOK
if (C == 0xa7aa)
return 0x0266;
- // LATIN CAPITAL LETTER REVERSED OPEN E\n'
+ // LATIN CAPITAL LETTER REVERSED OPEN E
if (C == 0xa7ab)
return 0x025c;
- // LATIN CAPITAL LETTER SCRIPT G\n'
+ // LATIN CAPITAL LETTER SCRIPT G
if (C == 0xa7ac)
return 0x0261;
- // LATIN CAPITAL LETTER L WITH BELT\n'
+ // LATIN CAPITAL LETTER L WITH BELT
if (C == 0xa7ad)
return 0x026c;
- // LATIN CAPITAL LETTER SMALL CAPITAL I\n'
+ // LATIN CAPITAL LETTER SMALL CAPITAL I
if (C == 0xa7ae)
return 0x026a;
- // LATIN CAPITAL LETTER TURNED K\n'
+ // LATIN CAPITAL LETTER TURNED K
if (C == 0xa7b0)
return 0x029e;
- // LATIN CAPITAL LETTER TURNED T\n'
+ // LATIN CAPITAL LETTER TURNED T
if (C == 0xa7b1)
return 0x0287;
- // LATIN CAPITAL LETTER J WITH CROSSED-TAIL\n'
+ // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
if (C == 0xa7b2)
return 0x029d;
- // LATIN CAPITAL LETTER CHI\n'
+ // LATIN CAPITAL LETTER CHI
if (C == 0xa7b3)
return 0xab53;
if (C < 0xa7b4)
@@ -721,13 +721,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 8 characters
if (C <= 0xa7c2)
return C | 1;
- // LATIN CAPITAL LETTER C WITH PALATAL HOOK\n'
+ // LATIN CAPITAL LETTER C WITH PALATAL HOOK
if (C == 0xa7c4)
return 0xa794;
- // LATIN CAPITAL LETTER S WITH HOOK\n'
+ // LATIN CAPITAL LETTER S WITH HOOK
if (C == 0xa7c5)
return 0x0282;
- // LATIN CAPITAL LETTER Z WITH PALATAL HOOK\n'
+ // LATIN CAPITAL LETTER Z WITH PALATAL HOOK
if (C == 0xa7c6)
return 0x1d8e;
if (C < 0xa7c7)
@@ -735,7 +735,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0xa7c9 && C % 2 == 1)
return C + 1;
- // LATIN CAPITAL LETTER RAMS HORN\n'
+ // LATIN CAPITAL LETTER RAMS HORN
if (C == 0xa7cb)
return 0x0264;
if (C < 0xa7cc)
@@ -743,7 +743,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 8 characters
if (C <= 0xa7da)
return C | 1;
- // LATIN CAPITAL LETTER LAMBDA WITH STROKE\n'
+ // LATIN CAPITAL LETTER LAMBDA WITH STROKE
if (C == 0xa7dc)
return 0x019b;
if (C < 0xa7dd)
@@ -751,7 +751,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 2 characters
if (C <= 0xa7e2 && C % 5 == 3)
return C + -42342;
- // LATIN CAPITAL LETTER REVERSED HALF H\n'
+ // LATIN CAPITAL LETTER REVERSED HALF H
if (C == 0xa7f5)
return 0xa7f6;
if (C < 0xab6c)
@@ -764,7 +764,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 80 characters
if (C <= 0xabbf)
return C + -38864;
- // LATIN SMALL LIGATURE LONG S T\n'
+ // LATIN SMALL LIGATURE LONG S T
if (C == 0xfb05)
return 0xfb06;
if (C < 0xff21)
@@ -852,7 +852,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
// 7 characters
if (C <= 0x1df7e)
return C | 1;
- // LATIN SMALL LIGATURE LONG S WITH DESCENDER S\n'
+ // LATIN SMALL LIGATURE LONG S WITH DESCENDER S
if (C == 0x1df95)
return 0x00df;
if (C < 0x1e900)
>From 293470d13ba0bdfbd254a8a2917daa597c2ac54e Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:51:48 -0500
Subject: [PATCH 08/14] Update UnicodeNameMappingGenerator.cpp
---
llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
index 106a51b80afca..97dd47fba7a79 100644
--- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
+++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
@@ -354,9 +354,9 @@ int main(int argc, char **argv) {
"Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
argv[0]);
printf("NameAliases.txt can be found at "
- "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
+ "https://unicode.org/Public/18.0.0/ucd/NameAliases.txt\n"
"UnicodeData.txt can be found at "
- "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
+ "https://unicode.org/Public/18.0.0/ucd/UnicodeData.txt\n\n");
if (argc != 4)
return EXIT_FAILURE;
>From c78ce9bb0fd7ed90c131722592c3186184127023 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:03:06 -0500
Subject: [PATCH 09/14] Update generate_escaped_output_table.py
---
libcxx/utils/generate_escaped_output_table.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index dd7173201b466..20bf572dc6dbe 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -105,7 +105,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
/// - Unassigned.
///
/// The data is generated from
-/// - https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
+/// - https://www.unicode.org/Public/18.0.0/ucd/extracted/DerivedGeneralCategory.txt
///
/// The table is similar to the table
/// __extended_grapheme_custer_property_boundary::__entries
>From f8ffabc8541eb32acae5d0656be5501e4d0ad174 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:11:56 -0500
Subject: [PATCH 10/14] Update entries of last two blocks for 18.0.0
---
libcxx/utils/generate_escaped_output_table.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index 20bf572dc6dbe..d043dfb1a681c 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -330,10 +330,10 @@ def generate_data_tables() -> str:
# The exact entries may differ between Unicode versions. When these numbers
# change the test needs to be updated too.
# libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
- assert (data[-2].lower) == 0x323B0
- assert (data[-2].upper) == 0xE00FF
- assert (data[-1].lower) == 0xE01F0
- assert (data[-1].upper) == 0x10FFFF
+ assert (data[-2].lower) == 0x3FC40, hex(data[-2].lower)
+ assert (data[-2].upper) == 0xE00FF, hex(data[-2].upper)
+ assert (data[-1].lower) == 0xE01F0, hex(data[-1].lower)
+ assert (data[-1].upper) == 0x10FFFF, hex(data[-1].upper)
return "\n".join(
[
>From 8a8a1cf2b8851c03d97deafc01bc9d3b2ead2773 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:13:33 -0500
Subject: [PATCH 11/14] Update excluded range
---
.../format.string/format.string.std/escaped_output.pass.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index d24db360173b1..39a21fbeb9778 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -73,7 +73,7 @@ inline constexpr int C = Cc + Cf + Cs + Co + Cn;
// [323B0, 10FFFF] use a hand-crafted algorithm.
//
// This means a number of entries are omitted
-inline constexpr int excluded = ((0x10FFFF - 0x323B0) + 1) - 240;
+inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;
inline constexpr int entries = Z + C - excluded;
>From c6c47f6c028ef3e862661c0209996cc43bb7b2d6 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:14:43 -0500
Subject: [PATCH 12/14] Update escaped_output.pass.cpp
---
.../format.string/format.string.std/escaped_output.pass.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index 39a21fbeb9778..3c5502652866b 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -66,11 +66,11 @@ inline constexpr int C = Cc + Cf + Cs + Co + Cn;
// 100000..10FFFD; Co # [65534] <private-use-100000>..<private-use-10FFFD>
// 10FFFE..10FFFF; Cn # [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>
//
-// It can be observed all entries in the range 323B0..10FFFF are in the
+// It can be observed all entries in the range 3FC40..10FFFF are in the
// categories Cf, Co, Cn, except a small range with the property Mn.
// In order to reduce the size of the table only the entires in the range
-// [0000, 323B0) are stored in the table. The entries in the range
-// [323B0, 10FFFF] use a hand-crafted algorithm.
+// [0000, 3FC40) are stored in the table. The entries in the range
+// [3FC40, 10FFFF] use a hand-crafted algorithm.
//
// This means a number of entries are omitted
inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;
>From b6f14c307888132e9ee09fd1651ae2764ae75344 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 07:47:12 -0500
Subject: [PATCH 13/14] Account for new Unicode property ranges in 18
---
.../format.string.std/escaped_output.pass.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index 3c5502652866b..ee1fae71f66ea 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -48,13 +48,15 @@ inline constexpr int Cc = 65;
inline constexpr int Cf = 170;
inline constexpr int Cs = 2'048;
inline constexpr int Co = 137'468;
-inline constexpr int Cn = 819'533;
+inline constexpr int Cn = 801'683;
inline constexpr int C = Cc + Cf + Cs + Co + Cn;
// This is the final part of the Unicode properties table:
//
-// 31350..323AF ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-// 323B0..E0000 ; Cn # [711761] <reserved-323B0>..<reserved-E0000>
+// 31350..33479 ; Lo # [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479
+// 3347A..3CFFF ; Cn # [39814] <reserved-3347A>..<reserved-3CFFF>
+// 3D000..3FC3F ; Lo # [11328] SEAL CHARACTER-3D000..SEAL CHARACTER-3FC3F
+// 3FC40..E0000 ; Cn # [656321] <reserved-3FC40>..<reserved-E0000>
// E0001 ; Cf # LANGUAGE TAG
// E0002..E001F ; Cn # [30] <reserved-E0002>..<reserved-E001F>
// E0020..E007F ; Cf # [96] TAG SPACE..CANCEL TAG
@@ -73,7 +75,7 @@ inline constexpr int C = Cc + Cf + Cs + Co + Cn;
// [3FC40, 10FFFF] use a hand-crafted algorithm.
//
// This means a number of entries are omitted
-inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;
+inline constexpr int excluded = ((0x10FFFF - 0x3347A) + 1) - 240;
inline constexpr int entries = Z + C - excluded;
>From 6b810bc4666d02b99eea232db0d5eab566aa8c4c Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 08:24:25 -0500
Subject: [PATCH 14/14] Final corrected excluded codepoint is 0x3fc40
---
.../format.string/format.string.std/escaped_output.pass.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index ee1fae71f66ea..9e51b064e6f5a 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -75,7 +75,7 @@ inline constexpr int C = Cc + Cf + Cs + Co + Cn;
// [3FC40, 10FFFF] use a hand-crafted algorithm.
//
// This means a number of entries are omitted
-inline constexpr int excluded = ((0x10FFFF - 0x3347A) + 1) - 240;
+inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;
inline constexpr int entries = Z + C - excluded;
More information about the libcxx-commits
mailing list