[libcxx-commits] [libcxx] [llvm] Finish supporting Unicode 18 (PR #199111)

Charlie Lin via libcxx-commits libcxx-commits at lists.llvm.org
Fri May 22 06:24:43 PDT 2026


https://github.com/clin1234 updated https://github.com/llvm/llvm-project/pull/199111

>From e80090631e85bc24b8742d0922bf88eff998c09e Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 13:52:02 -0500
Subject: [PATCH 01/14] Update Unicode version in comments to 18.0.0 in
 generate_escaped_output_table.py

---
 libcxx/utils/generate_escaped_output_table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index 30583dd8cdc85..dd7173201b466 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -322,7 +322,7 @@ def generate_data_tables() -> str:
 
     # The output table has two large entries at the end, with a small "gap"
     #   E0100..E01EF  ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-    # Based on Unicode 15.1.0:
+    # Based on Unicode 18.0.0:
     # - Encoding all these entries in the table requires 1173 entries.
     # - Manually handling these last two blocks reduces the size to 729 entries.
     # This not only reduces the binary size, but also improves the performance

>From e3ddeb6cc37f69abeb9f6f7e7303030a17e3af9c Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 13:55:08 -0500
Subject: [PATCH 02/14] Update Unicode data URL in
 UnicodeNameMappingGenerator.cpp

---
 llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
index 5796c94afef4a..106a51b80afca 100644
--- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
+++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
@@ -8,7 +8,7 @@
 //
 // This file is used to generate lib/Support/UnicodeNameToCodepointGenerated.cpp
 // using UnicodeData.txt and NameAliases.txt available at
-// https://unicode.org/Public/15.1.0/ucd/
+// https://unicode.org/Public/18.0.0/ucd/
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/STLExtras.h"

>From 06d685b9d2814f5fa286b993bea38d90f1f4e458 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 14:38:07 -0500
Subject: [PATCH 03/14] Update __entries array size and values for Unicode 18

---
 .../include/__format/escaped_output_table.h   | 141 ++++++++++--------
 1 file changed, 75 insertions(+), 66 deletions(-)

diff --git a/libcxx/include/__format/escaped_output_table.h b/libcxx/include/__format/escaped_output_table.h
index a62c715811a0c..c0d401d9a00c1 100644
--- a/libcxx/include/__format/escaped_output_table.h
+++ b/libcxx/include/__format/escaped_output_table.h
@@ -110,7 +110,7 @@ namespace __escaped_output_table {
 /// - bits [14, 31] The lower bound code point of the range. The upper bound of
 ///   the range is lower bound + size. Note the code expects code units the fit
 ///   into 18 bits, instead of the 21 bits needed for the full Unicode range.
-_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
+_LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[744] = {
     0x00000020 /* 00000000 - 00000020 [   33] */,
     0x001fc021 /* 0000007f - 000000a0 [   34] */,
     0x002b4000 /* 000000ad - 000000ad [    1] */,
@@ -120,10 +120,9 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x00e34000 /* 0000038d - 0000038d [    1] */,
     0x00e88000 /* 000003a2 - 000003a2 [    1] */,
     0x014c0000 /* 00000530 - 00000530 [    1] */,
-    0x0155c001 /* 00000557 - 00000558 [    2] */,
-    0x0162c001 /* 0000058b - 0000058c [    2] */,
+    0x0155c000 /* 00000557 - 00000557 [    1] */,
     0x01640000 /* 00000590 - 00000590 [    1] */,
-    0x01720007 /* 000005c8 - 000005cf [    8] */,
+    0x01728005 /* 000005ca - 000005cf [    6] */,
     0x017ac003 /* 000005eb - 000005ee [    4] */,
     0x017d4010 /* 000005f5 - 00000605 [   17] */,
     0x01870000 /* 0000061c - 0000061c [    1] */,
@@ -137,7 +136,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x02170001 /* 0000085c - 0000085d [    2] */,
     0x0217c000 /* 0000085f - 0000085f [    1] */,
     0x021ac004 /* 0000086b - 0000086f [    5] */,
-    0x0223c007 /* 0000088f - 00000896 [    8] */,
+    0x02240006 /* 00000890 - 00000896 [    7] */,
     0x02388000 /* 000008e2 - 000008e2 [    1] */,
     0x02610000 /* 00000984 - 00000984 [    1] */,
     0x02634001 /* 0000098d - 0000098e [    2] */,
@@ -192,7 +191,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x02ce8001 /* 00000b3a - 00000b3b [    2] */,
     0x02d14001 /* 00000b45 - 00000b46 [    2] */,
     0x02d24001 /* 00000b49 - 00000b4a [    2] */,
-    0x02d38006 /* 00000b4e - 00000b54 [    7] */,
+    0x02d38004 /* 00000b4e - 00000b52 [    5] */,
     0x02d60003 /* 00000b58 - 00000b5b [    4] */,
     0x02d78000 /* 00000b5e - 00000b5e [    1] */,
     0x02d90001 /* 00000b64 - 00000b65 [    2] */,
@@ -221,7 +220,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x03124000 /* 00000c49 - 00000c49 [    1] */,
     0x03138006 /* 00000c4e - 00000c54 [    7] */,
     0x0315c000 /* 00000c57 - 00000c57 [    1] */,
-    0x0316c001 /* 00000c5b - 00000c5c [    2] */,
+    0x0316c000 /* 00000c5b - 00000c5b [    1] */,
     0x03178001 /* 00000c5e - 00000c5f [    2] */,
     0x03190001 /* 00000c64 - 00000c65 [    2] */,
     0x031c0006 /* 00000c70 - 00000c76 [    7] */,
@@ -233,7 +232,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x03314000 /* 00000cc5 - 00000cc5 [    1] */,
     0x03324000 /* 00000cc9 - 00000cc9 [    1] */,
     0x03338006 /* 00000cce - 00000cd4 [    7] */,
-    0x0335c005 /* 00000cd7 - 00000cdc [    6] */,
+    0x0335c004 /* 00000cd7 - 00000cdb [    5] */,
     0x0337c000 /* 00000cdf - 00000cdf [    1] */,
     0x03390001 /* 00000ce4 - 00000ce5 [    2] */,
     0x033c0000 /* 00000cf0 - 00000cf0 [    1] */,
@@ -331,7 +330,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x06a28005 /* 00001a8a - 00001a8f [    6] */,
     0x06a68005 /* 00001a9a - 00001a9f [    6] */,
     0x06ab8001 /* 00001aae - 00001aaf [    2] */,
-    0x06b3c030 /* 00001acf - 00001aff [   49] */,
+    0x06bc400e /* 00001af1 - 00001aff [   15] */,
     0x06d34000 /* 00001b4d - 00001b4d [    1] */,
     0x06fd0007 /* 00001bf4 - 00001bfb [    8] */,
     0x070e0002 /* 00001c38 - 00001c3a [    3] */,
@@ -359,15 +358,12 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x080a0007 /* 00002028 - 0000202f [    8] */,
     0x0817c010 /* 0000205f - 0000206f [   17] */,
     0x081c8001 /* 00002072 - 00002073 [    2] */,
-    0x0823c000 /* 0000208f - 0000208f [    1] */,
-    0x08274002 /* 0000209d - 0000209f [    3] */,
-    0x0830400e /* 000020c1 - 000020cf [   15] */,
+    0x0831400a /* 000020c5 - 000020cf [   11] */,
     0x083c400e /* 000020f1 - 000020ff [   15] */,
     0x08630003 /* 0000218c - 0000218f [    4] */,
     0x090a8015 /* 0000242a - 0000243f [   22] */,
     0x0912c014 /* 0000244b - 0000245f [   21] */,
     0x0add0001 /* 00002b74 - 00002b75 [    2] */,
-    0x0ae58000 /* 00002b96 - 00002b96 [    1] */,
     0x0b3d0004 /* 00002cf4 - 00002cf8 [    5] */,
     0x0b498000 /* 00002d26 - 00002d26 [    1] */,
     0x0b4a0004 /* 00002d28 - 00002d2c [    5] */,
@@ -383,7 +379,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x0b73c000 /* 00002dcf - 00002dcf [    1] */,
     0x0b75c000 /* 00002dd7 - 00002dd7 [    1] */,
     0x0b77c000 /* 00002ddf - 00002ddf [    1] */,
-    0x0b978021 /* 00002e5e - 00002e7f [   34] */,
+    0x0b978001 /* 00002e5e - 00002e5f [    2] */,
+    0x0b99001b /* 00002e64 - 00002e7f [   28] */,
     0x0ba68000 /* 00002e9a - 00002e9a [    1] */,
     0x0bbd000b /* 00002ef4 - 00002eff [   12] */,
     0x0bf58019 /* 00002fd6 - 00002fef [   26] */,
@@ -399,10 +396,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x2931c008 /* 0000a4c7 - 0000a4cf [    9] */,
     0x298b0013 /* 0000a62c - 0000a63f [   20] */,
     0x29be0007 /* 0000a6f8 - 0000a6ff [    8] */,
-    0x29f38001 /* 0000a7ce - 0000a7cf [    2] */,
-    0x29f48000 /* 0000a7d2 - 0000a7d2 [    1] */,
-    0x29f50000 /* 0000a7d4 - 0000a7d4 [    1] */,
-    0x29f74014 /* 0000a7dd - 0000a7f1 [   21] */,
+    0x29f78003 /* 0000a7de - 0000a7e1 [    4] */,
+    0x29f8c00d /* 0000a7e3 - 0000a7f0 [   14] */,
     0x2a0b4002 /* 0000a82d - 0000a82f [    3] */,
     0x2a0e8005 /* 0000a83a - 0000a83f [    6] */,
     0x2a1e0007 /* 0000a878 - 0000a87f [    8] */,
@@ -423,7 +418,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x2ac5c008 /* 0000ab17 - 0000ab1f [    9] */,
     0x2ac9c000 /* 0000ab27 - 0000ab27 [    1] */,
     0x2acbc000 /* 0000ab2f - 0000ab2f [    1] */,
-    0x2adb0003 /* 0000ab6c - 0000ab6f [    4] */,
+    0x2adb8001 /* 0000ab6e - 0000ab6f [    2] */,
     0x2afb8001 /* 0000abee - 0000abef [    2] */,
     0x2afe8005 /* 0000abfa - 0000abff [    6] */,
     0x35e9000b /* 0000d7a4 - 0000d7af [   12] */,
@@ -438,9 +433,6 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x3ecfc000 /* 0000fb3f - 0000fb3f [    1] */,
     0x3ed08000 /* 0000fb42 - 0000fb42 [    1] */,
     0x3ed14000 /* 0000fb45 - 0000fb45 [    1] */,
-    0x3ef0c00f /* 0000fbc3 - 0000fbd2 [   16] */,
-    0x3f640001 /* 0000fd90 - 0000fd91 [    2] */,
-    0x3f720006 /* 0000fdc8 - 0000fdce [    7] */,
     0x3f74001f /* 0000fdd0 - 0000fdef [   32] */,
     0x3f868005 /* 0000fe1a - 0000fe1f [    6] */,
     0x3f94c000 /* 0000fe53 - 0000fe53 [    1] */,
@@ -498,7 +490,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x41da0017 /* 00010768 - 0001077f [   24] */,
     0x41e18000 /* 00010786 - 00010786 [    1] */,
     0x41ec4000 /* 000107b1 - 000107b1 [    1] */,
-    0x41eec044 /* 000107bb - 000107ff [   69] */,
+    0x41f0003f /* 000107c0 - 000107ff [   64] */,
     0x42018001 /* 00010806 - 00010807 [    2] */,
     0x42024000 /* 00010809 - 00010809 [    1] */,
     0x420d8000 /* 00010836 - 00010836 [    1] */,
@@ -511,7 +503,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x423d8004 /* 000108f6 - 000108fa [    5] */,
     0x42470002 /* 0001091c - 0001091e [    3] */,
     0x424e8004 /* 0001093a - 0001093e [    5] */,
-    0x4250003f /* 00010940 - 0001097f [   64] */,
+    0x42568025 /* 0001095a - 0001097f [   38] */,
     0x426e0003 /* 000109b8 - 000109bb [    4] */,
     0x42740001 /* 000109d0 - 000109d1 [    2] */,
     0x42810000 /* 00010a04 - 00010a04 [    1] */,
@@ -543,7 +535,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x43aa8000 /* 00010eaa - 00010eaa [    1] */,
     0x43ab8001 /* 00010eae - 00010eaf [    2] */,
     0x43ac800f /* 00010eb2 - 00010ec1 [   16] */,
-    0x43b14036 /* 00010ec5 - 00010efb [   55] */,
+    0x43b20000 /* 00010ec8 - 00010ec8 [    1] */,
+    0x43bbc000 /* 00010eef - 00010eef [    1] */,
     0x43ca0007 /* 00010f28 - 00010f2f [    8] */,
     0x43d68015 /* 00010f5a - 00010f6f [   22] */,
     0x43e28025 /* 00010f8a - 00010faf [   38] */,
@@ -626,7 +619,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x46920007 /* 00011a48 - 00011a4f [    8] */,
     0x46a8c00c /* 00011aa3 - 00011aaf [   13] */,
     0x46be4006 /* 00011af9 - 00011aff [    7] */,
-    0x46c280b5 /* 00011b0a - 00011bbf [  182] */,
+    0x46c2c054 /* 00011b0b - 00011b5f [   85] */,
+    0x46da0057 /* 00011b68 - 00011bbf [   88] */,
     0x46f8800d /* 00011be2 - 00011bef [   14] */,
     0x46fe8005 /* 00011bfa - 00011bff [    6] */,
     0x47024000 /* 00011c09 - 00011c09 [    1] */,
@@ -648,7 +642,10 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x4763c000 /* 00011d8f - 00011d8f [    1] */,
     0x47648000 /* 00011d92 - 00011d92 [    1] */,
     0x47664006 /* 00011d99 - 00011d9f [    7] */,
-    0x476a8135 /* 00011daa - 00011edf [  310] */,
+    0x476a8005 /* 00011daa - 00011daf [    6] */,
+    0x47770003 /* 00011ddc - 00011ddf [    4] */,
+    0x477a8005 /* 00011dea - 00011def [    6] */,
+    0x477c80ed /* 00011df2 - 00011edf [  238] */,
     0x47be4006 /* 00011ef9 - 00011eff [    7] */,
     0x47c44000 /* 00011f11 - 00011f11 [    1] */,
     0x47cec002 /* 00011f3b - 00011f3d [    3] */,
@@ -656,9 +653,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x47ec400e /* 00011fb1 - 00011fbf [   15] */,
     0x47fc800c /* 00011ff2 - 00011ffe [   13] */,
     0x48e68065 /* 0001239a - 000123ff [  102] */,
-    0x491bc000 /* 0001246f - 0001246f [    1] */,
-    0x491d400a /* 00012475 - 0001247f [   11] */,
-    0x49510a4b /* 00012544 - 00012f8f [ 2636] */,
+    0x4951000b /* 00012544 - 0001254f [   12] */,
+    0x49a1c908 /* 00012687 - 00012f8f [ 2313] */,
     0x4bfcc00c /* 00012ff3 - 00012fff [   13] */,
     0x4d0c000f /* 00013430 - 0001343f [   16] */,
     0x4d158009 /* 00013456 - 0001345f [   10] */,
@@ -677,40 +673,48 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x5ad88000 /* 00016b62 - 00016b62 [    1] */,
     0x5ade0004 /* 00016b78 - 00016b7c [    5] */,
     0x5ae401af /* 00016b90 - 00016d3f [  432] */,
-    0x5b5e80c5 /* 00016d7a - 00016e3f [  198] */,
-    0x5ba6c064 /* 00016e9b - 00016eff [  101] */,
+    0x5b5e8005 /* 00016d7a - 00016d7f [    6] */,
+    0x5b678001 /* 00016d9e - 00016d9f [    2] */,
+    0x5b6a8095 /* 00016daa - 00016e3f [  150] */,
+    0x5ba6c004 /* 00016e9b - 00016e9f [    5] */,
+    0x5bae4001 /* 00016eb9 - 00016eba [    2] */,
+    0x5bb5002b /* 00016ed4 - 00016eff [   44] */,
     0x5bd2c003 /* 00016f4b - 00016f4e [    4] */,
     0x5be20006 /* 00016f88 - 00016f8e [    7] */,
     0x5be8003f /* 00016fa0 - 00016fdf [   64] */,
     0x5bf9400a /* 00016fe5 - 00016fef [   11] */,
-    0x5bfc800d /* 00016ff2 - 00016fff [   14] */,
-    0x61fe0007 /* 000187f8 - 000187ff [    8] */,
-    0x63358028 /* 00018cd6 - 00018cfe [   41] */,
-    0x634262e6 /* 00018d09 - 0001afef [ 8935] */,
+    0x5bfdc008 /* 00016ff7 - 00016fff [    9] */,
+    0x6336c023 /* 00018cdb - 00018cfe [   36] */,
+    0x6348405e /* 00018d21 - 00018d7f [   95] */,
+    0x637cc00c /* 00018df3 - 00018dff [   13] */,
+    0x6464800d /* 00019192 - 0001919f [   14] */,
+    0x6474de1c /* 000191d3 - 0001afef [ 7709] */,
     0x6bfd0000 /* 0001aff4 - 0001aff4 [    1] */,
     0x6bff0000 /* 0001affc - 0001affc [    1] */,
     0x6bffc000 /* 0001afff - 0001afff [    1] */,
-    0x6c48c00e /* 0001b123 - 0001b131 [   15] */,
+    0x6c4a4008 /* 0001b129 - 0001b131 [    9] */,
     0x6c4cc01c /* 0001b133 - 0001b14f [   29] */,
     0x6c54c001 /* 0001b153 - 0001b154 [    2] */,
     0x6c55800d /* 0001b156 - 0001b163 [   14] */,
-    0x6c5a0007 /* 0001b168 - 0001b16f [    8] */,
+    0x6c5a4006 /* 0001b169 - 0001b16f [    7] */,
     0x6cbf0903 /* 0001b2fc - 0001bbff [ 2308] */,
     0x6f1ac004 /* 0001bc6b - 0001bc6f [    5] */,
     0x6f1f4002 /* 0001bc7d - 0001bc7f [    3] */,
     0x6f224006 /* 0001bc89 - 0001bc8f [    7] */,
     0x6f268001 /* 0001bc9a - 0001bc9b [    2] */,
     0x6f280f5f /* 0001bca0 - 0001cbff [ 3936] */,
-    0x733e8005 /* 0001ccfa - 0001ccff [    6] */,
-    0x73ad004b /* 0001ceb4 - 0001ceff [   76] */,
+    0x733f4002 /* 0001ccfd - 0001ccff [    3] */,
+    0x73ad0005 /* 0001ceb4 - 0001ceb9 [    6] */,
+    0x73b44000 /* 0001ced1 - 0001ced1 [    1] */,
+    0x73b54007 /* 0001ced5 - 0001cedc [    8] */,
+    0x73bf8001 /* 0001cefe - 0001ceff [    2] */,
     0x73cb8001 /* 0001cf2e - 0001cf2f [    2] */,
     0x73d1c008 /* 0001cf47 - 0001cf4f [    9] */,
     0x73f1003b /* 0001cfc4 - 0001cfff [   60] */,
     0x743d8009 /* 0001d0f6 - 0001d0ff [   10] */,
-    0x7449c001 /* 0001d127 - 0001d128 [    2] */,
     0x745cc007 /* 0001d173 - 0001d17a [    8] */,
-    0x747ac014 /* 0001d1eb - 0001d1ff [   21] */,
-    0x74918079 /* 0001d246 - 0001d2bf [  122] */,
+    0x74918009 /* 0001d246 - 0001d24f [   10] */,
+    0x74a0803d /* 0001d282 - 0001d2bf [   62] */,
     0x74b5000b /* 0001d2d4 - 0001d2df [   12] */,
     0x74bd000b /* 0001d2f4 - 0001d2ff [   12] */,
     0x74d5c008 /* 0001d357 - 0001d35f [    9] */,
@@ -733,13 +737,14 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x75514000 /* 0001d545 - 0001d545 [    1] */,
     0x7551c002 /* 0001d547 - 0001d549 [    3] */,
     0x75544000 /* 0001d551 - 0001d551 [    1] */,
-    0x75a98001 /* 0001d6a6 - 0001d6a7 [    2] */,
+    0x75a9c000 /* 0001d6a7 - 0001d6a7 [    1] */,
     0x75f30001 /* 0001d7cc - 0001d7cd [    2] */,
     0x76a3000e /* 0001da8c - 0001da9a [   15] */,
     0x76a80000 /* 0001daa0 - 0001daa0 [    1] */,
-    0x76ac044f /* 0001dab0 - 0001deff [ 1104] */,
-    0x77c7c005 /* 0001df1f - 0001df24 [    6] */,
-    0x77cac0d4 /* 0001df2b - 0001dfff [  213] */,
+    0x76ac004f /* 0001dab0 - 0001daff [   80] */,
+    0x76c743e2 /* 0001db1d - 0001deff [  995] */,
+    0x77e0800d /* 0001df82 - 0001df8f [   14] */,
+    0x77e5c035 /* 0001df97 - 0001dfcc [   54] */,
     0x7801c000 /* 0001e007 - 0001e007 [    1] */,
     0x78064001 /* 0001e019 - 0001e01a [    2] */,
     0x78088000 /* 0001e022 - 0001e022 [    1] */,
@@ -756,7 +761,10 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x78c001cf /* 0001e300 - 0001e4cf [  464] */,
     0x793e80d5 /* 0001e4fa - 0001e5cf [  214] */,
     0x797ec003 /* 0001e5fb - 0001e5fe [    4] */,
-    0x798001df /* 0001e600 - 0001e7df [  480] */,
+    0x798000bf /* 0001e600 - 0001e6bf [  192] */,
+    0x79b7c000 /* 0001e6df - 0001e6df [    1] */,
+    0x79bd8007 /* 0001e6f6 - 0001e6fd [    8] */,
+    0x79c000df /* 0001e700 - 0001e7df [  224] */,
     0x79f9c000 /* 0001e7e7 - 0001e7e7 [    1] */,
     0x79fb0000 /* 0001e7ec - 0001e7ec [    1] */,
     0x79fbc000 /* 0001e7ef - 0001e7ef [    1] */,
@@ -808,44 +816,45 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
     0x7c300000 /* 0001f0c0 - 0001f0c0 [    1] */,
     0x7c340000 /* 0001f0d0 - 0001f0d0 [    1] */,
     0x7c3d8009 /* 0001f0f6 - 0001f0ff [   10] */,
-    0x7c6b8037 /* 0001f1ae - 0001f1e5 [   56] */,
+    0x7c6bc036 /* 0001f1af - 0001f1e5 [   55] */,
     0x7c80c00c /* 0001f203 - 0001f20f [   13] */,
     0x7c8f0003 /* 0001f23c - 0001f23f [    4] */,
     0x7c924006 /* 0001f249 - 0001f24f [    7] */,
     0x7c94800d /* 0001f252 - 0001f25f [   14] */,
     0x7c998099 /* 0001f266 - 0001f2ff [  154] */,
-    0x7db60003 /* 0001f6d8 - 0001f6db [    4] */,
+    0x7db68001 /* 0001f6da - 0001f6db [    2] */,
     0x7dbb4002 /* 0001f6ed - 0001f6ef [    3] */,
     0x7dbf4002 /* 0001f6fd - 0001f6ff [    3] */,
-    0x7dddc003 /* 0001f777 - 0001f77a [    4] */,
-    0x7df68005 /* 0001f7da - 0001f7df [    6] */,
+    0x7df70003 /* 0001f7dc - 0001f7df [    4] */,
     0x7dfb0003 /* 0001f7ec - 0001f7ef [    4] */,
-    0x7dfc400e /* 0001f7f1 - 0001f7ff [   15] */,
     0x7e030003 /* 0001f80c - 0001f80f [    4] */,
     0x7e120007 /* 0001f848 - 0001f84f [    8] */,
     0x7e168005 /* 0001f85a - 0001f85f [    6] */,
     0x7e220007 /* 0001f888 - 0001f88f [    8] */,
     0x7e2b8001 /* 0001f8ae - 0001f8af [    2] */,
     0x7e2f0003 /* 0001f8bc - 0001f8bf [    4] */,
-    0x7e30803d /* 0001f8c2 - 0001f8ff [   62] */,
-    0x7e95000b /* 0001fa54 - 0001fa5f [   12] */,
+    0x7e30800d /* 0001f8c2 - 0001f8cf [   14] */,
+    0x7e364026 /* 0001f8d9 - 0001f8ff [   39] */,
+    0x7e960007 /* 0001fa58 - 0001fa5f [    8] */,
     0x7e9b8001 /* 0001fa6e - 0001fa6f [    2] */,
     0x7e9f4002 /* 0001fa7d - 0001fa7f [    3] */,
-    0x7ea28004 /* 0001fa8a - 0001fa8e [    5] */,
-    0x7eb1c006 /* 0001fac7 - 0001facd [    7] */,
-    0x7eb74001 /* 0001fadd - 0001fade [    2] */,
-    0x7eba8005 /* 0001faea - 0001faef [    6] */,
-    0x7ebe4006 /* 0001faf9 - 0001faff [    7] */,
+    0x7eb1c000 /* 0001fac7 - 0001fac7 [    1] */,
+    0x7eb24002 /* 0001fac9 - 0001facb [    3] */,
+    0x7eb78000 /* 0001fade - 0001fade [    1] */,
+    0x7ebb0002 /* 0001faec - 0001faee [    3] */,
+    0x7ebec004 /* 0001fafb - 0001faff [    5] */,
     0x7ee4c000 /* 0001fb93 - 0001fb93 [    1] */,
-    0x7efe8405 /* 0001fbfa - 0001ffff [ 1030] */,
+    0x7efec404 /* 0001fbfb - 0001ffff [ 1029] */,
     0xa9b8001f /* 0002a6e0 - 0002a6ff [   32] */,
-    0xadce8005 /* 0002b73a - 0002b73f [    6] */,
-    0xae078001 /* 0002b81e - 0002b81f [    2] */,
-    0xb3a8800d /* 0002cea2 - 0002ceaf [   14] */,
+    0xae07c000 /* 0002b81f - 0002b81f [    1] */,
+    0xb3ab8001 /* 0002ceae - 0002ceaf [    2] */,
     0xbaf8400e /* 0002ebe1 - 0002ebef [   15] */,
     0xbb9789a1 /* 0002ee5e - 0002f7ff [ 2466] */,
     0xbe8785e1 /* 0002fa1e - 0002ffff [ 1506] */,
-    0xc4d2c004 /* 0003134b - 0003134f [    5] */};
+    0xc4d2c004 /* 0003134b - 0003134f [    5] */,
+    0xcd1ebfff /* 0003347a - 00037479 [16384] */,
+    0xdd1ebfff /* 0003747a - 0003b479 [16384] */,
+    0xed1e9b85 /* 0003b47a - 0003cfff [ 7046] */};
 
 /// Returns whether the code unit needs to be escaped.
 ///
@@ -866,7 +875,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr uint32_t __entries[735] = {
      return false;
 
   // The entries at the end.
-  if (__code_point >= 0x000323b0)
+  if (__code_point >= 0x0003fc40)
     return true;
 
   ptrdiff_t __i = std::upper_bound(std::begin(__entries), std::end(__entries), (__code_point << 14) | 0x3fffu) - __entries;

>From b796ef4a6534e80ef6c1d6e821fde7a5300ab2e0 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:27:36 -0500
Subject: [PATCH 04/14] Update unicode-case-fold.py

---
 llvm/utils/unicode-case-fold.py | 44 +++++++++++++++++----------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/llvm/utils/unicode-case-fold.py b/llvm/utils/unicode-case-fold.py
index 4afb41d4060f0..ca0423082b724 100755
--- a/llvm/utils/unicode-case-fold.py
+++ b/llvm/utils/unicode-case-fold.py
@@ -36,7 +36,9 @@ def mappings(f):
         m = expr.match(line)
         if not m:
             continue
-        from_char = int(m.group(1), 16)
+        group_1 = m.group(1)
+        if "b" in group_1: group_1 = group_1[2:]
+        from_char = int(group_1, 16)
         to_char = int(m.group(2), 16)
         from_name = m.group(3)
 
@@ -101,28 +103,28 @@ def dump_block(b):
 
 
 current_block = []
-f = urlopen(sys.argv[1])
-for m in mappings(f):
-    if len(current_block) == 0:
-        current_block.append(m)
-        continue
-
-    if shift(current_block[0]) != shift(m):
-        # Incompatible shift, start a new block.
+with urlopen(sys.argv[1]) as f:
+    g = [x.decode("utf-8") for x in f.readlines()]  # urlopen yields bytes; str(x) would give the b'...' repr
+    for m in mappings(g):
+        if len(current_block) == 0:
+            current_block.append(m)
+            continue
+
+        if shift(current_block[0]) != shift(m):
+            # Incompatible shift, start a new block.
+            dump_block(current_block)
+            current_block = [m]
+            continue
+
+        if len(current_block) == 1 or stride(current_block) == stride2(
+            current_block[-1], m
+        ):
+            current_block.append(m)
+            continue
+
+        # Incompatible stride, start a new block.
+        dump_block(current_block)
+        current_block = [m]
-        continue
-
-    if len(current_block) == 1 or stride(current_block) == stride2(
-        current_block[-1], m
-    ):
-        current_block.append(m)
-        continue
-
-    # Incompatible stride, start a new block.
-    dump_block(current_block)
-    current_block = [m]
-f.close()
 
 dump_block(current_block)
 

>From 0b6e8367e58be1c003430942372b81166faef7f3 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:28:40 -0500
Subject: [PATCH 05/14] Update UnicodeCaseFold.cpp

---
 llvm/lib/Support/UnicodeCaseFold.cpp | 285 ++++++++++++++++-----------
 1 file changed, 170 insertions(+), 115 deletions(-)

diff --git a/llvm/lib/Support/UnicodeCaseFold.cpp b/llvm/lib/Support/UnicodeCaseFold.cpp
index abf6df8d1e3c3..007c6261568ca 100644
--- a/llvm/lib/Support/UnicodeCaseFold.cpp
+++ b/llvm/lib/Support/UnicodeCaseFold.cpp
@@ -2,11 +2,11 @@
 //
 // This file was generated by utils/unicode-case-fold.py from the Unicode
 // case folding database at
-//    http://www.unicode.org/Public/15.1.0/ucd/CaseFolding.txt
+//    https://www.unicode.org/Public/draft/ucd/CaseFolding.txt
 //
 // To regenerate this file, run:
 //   utils/unicode-case-fold.py \
-//     "http://www.unicode.org/Public/15.1.0/ucd/CaseFolding.txt" \
+//     "https://www.unicode.org/Public/draft/ucd/CaseFolding.txt" \
 //     > lib/Support/UnicodeCaseFold.cpp
 //
 //===----------------------------------------------------------------------===//
@@ -19,7 +19,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 26 characters
   if (C <= 0x005a)
     return C + 32;
-  // MICRO SIGN
+  // MICRO SIGN
   if (C == 0x00b5)
     return 0x03bc;
   if (C < 0x00c0)
@@ -52,7 +52,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 23 characters
   if (C <= 0x0176)
     return C | 1;
-  // LATIN CAPITAL LETTER Y WITH DIAERESIS
+  // LATIN CAPITAL LETTER Y WITH DIAERESIS
   if (C == 0x0178)
     return 0x00ff;
   if (C < 0x0179)
@@ -60,10 +60,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x017d && C % 2 == 1)
     return C + 1;
-  // LATIN SMALL LETTER LONG S
+  // LATIN SMALL LETTER LONG S
   if (C == 0x017f)
     return 0x0073;
-  // LATIN CAPITAL LETTER B WITH HOOK
+  // LATIN CAPITAL LETTER B WITH HOOK
   if (C == 0x0181)
     return 0x0253;
   if (C < 0x0182)
@@ -71,10 +71,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x0184)
     return C | 1;
-  // LATIN CAPITAL LETTER OPEN O
+  // LATIN CAPITAL LETTER OPEN O
   if (C == 0x0186)
     return 0x0254;
-  // LATIN CAPITAL LETTER C WITH HOOK
+  // LATIN CAPITAL LETTER C WITH HOOK
   if (C == 0x0187)
     return 0x0188;
   if (C < 0x0189)
@@ -82,43 +82,43 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x018a)
     return C + 205;
-  // LATIN CAPITAL LETTER D WITH TOPBAR
+  // LATIN CAPITAL LETTER D WITH TOPBAR
   if (C == 0x018b)
     return 0x018c;
-  // LATIN CAPITAL LETTER REVERSED E
+  // LATIN CAPITAL LETTER REVERSED E
   if (C == 0x018e)
     return 0x01dd;
-  // LATIN CAPITAL LETTER SCHWA
+  // LATIN CAPITAL LETTER SCHWA
   if (C == 0x018f)
     return 0x0259;
-  // LATIN CAPITAL LETTER OPEN E
+  // LATIN CAPITAL LETTER OPEN E
   if (C == 0x0190)
     return 0x025b;
-  // LATIN CAPITAL LETTER F WITH HOOK
+  // LATIN CAPITAL LETTER F WITH HOOK
   if (C == 0x0191)
     return 0x0192;
-  // LATIN CAPITAL LETTER G WITH HOOK
+  // LATIN CAPITAL LETTER G WITH HOOK
   if (C == 0x0193)
     return 0x0260;
-  // LATIN CAPITAL LETTER GAMMA
+  // LATIN CAPITAL LETTER GAMMA
   if (C == 0x0194)
     return 0x0263;
-  // LATIN CAPITAL LETTER IOTA
+  // LATIN CAPITAL LETTER IOTA
   if (C == 0x0196)
     return 0x0269;
-  // LATIN CAPITAL LETTER I WITH STROKE
+  // LATIN CAPITAL LETTER I WITH STROKE
   if (C == 0x0197)
     return 0x0268;
-  // LATIN CAPITAL LETTER K WITH HOOK
+  // LATIN CAPITAL LETTER K WITH HOOK
   if (C == 0x0198)
     return 0x0199;
-  // LATIN CAPITAL LETTER TURNED M
+  // LATIN CAPITAL LETTER TURNED M
   if (C == 0x019c)
     return 0x026f;
-  // LATIN CAPITAL LETTER N WITH LEFT HOOK
+  // LATIN CAPITAL LETTER N WITH LEFT HOOK
   if (C == 0x019d)
     return 0x0272;
-  // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+  // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
   if (C == 0x019f)
     return 0x0275;
   if (C < 0x01a0)
@@ -126,22 +126,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x01a4)
     return C | 1;
-  // LATIN LETTER YR
+  // LATIN LETTER YR
   if (C == 0x01a6)
     return 0x0280;
-  // LATIN CAPITAL LETTER TONE TWO
+  // LATIN CAPITAL LETTER TONE TWO
   if (C == 0x01a7)
     return 0x01a8;
-  // LATIN CAPITAL LETTER ESH
+  // LATIN CAPITAL LETTER ESH
   if (C == 0x01a9)
     return 0x0283;
-  // LATIN CAPITAL LETTER T WITH HOOK
+  // LATIN CAPITAL LETTER T WITH HOOK
   if (C == 0x01ac)
     return 0x01ad;
-  // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+  // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
   if (C == 0x01ae)
     return 0x0288;
-  // LATIN CAPITAL LETTER U WITH HORN
+  // LATIN CAPITAL LETTER U WITH HORN
   if (C == 0x01af)
     return 0x01b0;
   if (C < 0x01b1)
@@ -154,7 +154,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x01b5 && C % 2 == 1)
     return C + 1;
-  // LATIN CAPITAL LETTER EZH
+  // LATIN CAPITAL LETTER EZH
   if (C == 0x01b7)
     return 0x0292;
   if (C < 0x01b8)
@@ -162,19 +162,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x01bc && C % 4 == 0)
     return C + 1;
-  // LATIN CAPITAL LETTER DZ WITH CARON
+  // LATIN CAPITAL LETTER DZ WITH CARON
   if (C == 0x01c4)
     return 0x01c6;
-  // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
+  // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
   if (C == 0x01c5)
     return 0x01c6;
-  // LATIN CAPITAL LETTER LJ
+  // LATIN CAPITAL LETTER LJ
   if (C == 0x01c7)
     return 0x01c9;
-  // LATIN CAPITAL LETTER L WITH SMALL LETTER J
+  // LATIN CAPITAL LETTER L WITH SMALL LETTER J
   if (C == 0x01c8)
     return 0x01c9;
-  // LATIN CAPITAL LETTER NJ
+  // LATIN CAPITAL LETTER NJ
   if (C == 0x01ca)
     return 0x01cc;
   if (C < 0x01cb)
@@ -187,7 +187,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 9 characters
   if (C <= 0x01ee)
     return C | 1;
-  // LATIN CAPITAL LETTER DZ
+  // LATIN CAPITAL LETTER DZ
   if (C == 0x01f1)
     return 0x01f3;
   if (C < 0x01f2)
@@ -195,10 +195,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x01f4)
     return C | 1;
-  // LATIN CAPITAL LETTER HWAIR
+  // LATIN CAPITAL LETTER HWAIR
   if (C == 0x01f6)
     return 0x0195;
-  // LATIN CAPITAL LETTER WYNN
+  // LATIN CAPITAL LETTER WYNN
   if (C == 0x01f7)
     return 0x01bf;
   if (C < 0x01f8)
@@ -206,7 +206,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 20 characters
   if (C <= 0x021e)
     return C | 1;
-  // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
+  // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
   if (C == 0x0220)
     return 0x019e;
   if (C < 0x0222)
@@ -214,28 +214,28 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 9 characters
   if (C <= 0x0232)
     return C | 1;
-  // LATIN CAPITAL LETTER A WITH STROKE
+  // LATIN CAPITAL LETTER A WITH STROKE
   if (C == 0x023a)
     return 0x2c65;
-  // LATIN CAPITAL LETTER C WITH STROKE
+  // LATIN CAPITAL LETTER C WITH STROKE
   if (C == 0x023b)
     return 0x023c;
-  // LATIN CAPITAL LETTER L WITH BAR
+  // LATIN CAPITAL LETTER L WITH BAR
   if (C == 0x023d)
     return 0x019a;
-  // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
+  // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
   if (C == 0x023e)
     return 0x2c66;
-  // LATIN CAPITAL LETTER GLOTTAL STOP
+  // LATIN CAPITAL LETTER GLOTTAL STOP
   if (C == 0x0241)
     return 0x0242;
-  // LATIN CAPITAL LETTER B WITH STROKE
+  // LATIN CAPITAL LETTER B WITH STROKE
   if (C == 0x0243)
     return 0x0180;
-  // LATIN CAPITAL LETTER U BAR
+  // LATIN CAPITAL LETTER U BAR
   if (C == 0x0244)
     return 0x0289;
-  // LATIN CAPITAL LETTER TURNED V
+  // LATIN CAPITAL LETTER TURNED V
   if (C == 0x0245)
     return 0x028c;
   if (C < 0x0246)
@@ -243,7 +243,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 5 characters
   if (C <= 0x024e)
     return C | 1;
-  // COMBINING GREEK YPOGEGRAMMENI
+  // COMBINING GREEK YPOGEGRAMMENI\n'
   if (C == 0x0345)
     return 0x03b9;
   if (C < 0x0370)
@@ -251,13 +251,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x0372)
     return C | 1;
-  // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+  // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA\n'
   if (C == 0x0376)
     return 0x0377;
-  // GREEK CAPITAL LETTER YOT
+  // GREEK CAPITAL LETTER YOT\n'
   if (C == 0x037f)
     return 0x03f3;
-  // GREEK CAPITAL LETTER ALPHA WITH TONOS
+  // GREEK CAPITAL LETTER ALPHA WITH TONOS\n'
   if (C == 0x0386)
     return 0x03ac;
   if (C < 0x0388)
@@ -265,7 +265,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x038a)
     return C + 37;
-  // GREEK CAPITAL LETTER OMICRON WITH TONOS
+  // GREEK CAPITAL LETTER OMICRON WITH TONOS\n'
   if (C == 0x038c)
     return 0x03cc;
   if (C < 0x038e)
@@ -283,22 +283,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 9 characters
   if (C <= 0x03ab)
     return C + 32;
-  // GREEK SMALL LETTER FINAL SIGMA
+  // GREEK SMALL LETTER FINAL SIGMA\n'
   if (C == 0x03c2)
     return 0x03c3;
-  // GREEK CAPITAL KAI SYMBOL
+  // GREEK CAPITAL KAI SYMBOL\n'
   if (C == 0x03cf)
     return 0x03d7;
-  // GREEK BETA SYMBOL
+  // GREEK BETA SYMBOL\n'
   if (C == 0x03d0)
     return 0x03b2;
-  // GREEK THETA SYMBOL
+  // GREEK THETA SYMBOL\n'
   if (C == 0x03d1)
     return 0x03b8;
-  // GREEK PHI SYMBOL
+  // GREEK PHI SYMBOL\n'
   if (C == 0x03d5)
     return 0x03c6;
-  // GREEK PI SYMBOL
+  // GREEK PI SYMBOL\n'
   if (C == 0x03d6)
     return 0x03c0;
   if (C < 0x03d8)
@@ -306,25 +306,25 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 12 characters
   if (C <= 0x03ee)
     return C | 1;
-  // GREEK KAPPA SYMBOL
+  // GREEK KAPPA SYMBOL\n'
   if (C == 0x03f0)
     return 0x03ba;
-  // GREEK RHO SYMBOL
+  // GREEK RHO SYMBOL\n'
   if (C == 0x03f1)
     return 0x03c1;
-  // GREEK CAPITAL THETA SYMBOL
+  // GREEK CAPITAL THETA SYMBOL\n'
   if (C == 0x03f4)
     return 0x03b8;
-  // GREEK LUNATE EPSILON SYMBOL
+  // GREEK LUNATE EPSILON SYMBOL\n'
   if (C == 0x03f5)
     return 0x03b5;
-  // GREEK CAPITAL LETTER SHO
+  // GREEK CAPITAL LETTER SHO\n'
   if (C == 0x03f7)
     return 0x03f8;
-  // GREEK CAPITAL LUNATE SIGMA SYMBOL
+  // GREEK CAPITAL LUNATE SIGMA SYMBOL\n'
   if (C == 0x03f9)
     return 0x03f2;
-  // GREEK CAPITAL LETTER SAN
+  // GREEK CAPITAL LETTER SAN\n'
   if (C == 0x03fa)
     return 0x03fb;
   if (C < 0x03fd)
@@ -352,7 +352,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 27 characters
   if (C <= 0x04be)
     return C | 1;
-  // CYRILLIC LETTER PALOCHKA
+  // CYRILLIC LETTER PALOCHKA\n'
   if (C == 0x04c0)
     return 0x04cf;
   if (C < 0x04c1)
@@ -385,13 +385,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 6 characters
   if (C <= 0x13fd)
     return C + -8;
-  // CYRILLIC SMALL LETTER ROUNDED VE
+  // CYRILLIC SMALL LETTER ROUNDED VE\n'
   if (C == 0x1c80)
     return 0x0432;
-  // CYRILLIC SMALL LETTER LONG-LEGGED DE
+  // CYRILLIC SMALL LETTER LONG-LEGGED DE\n'
   if (C == 0x1c81)
     return 0x0434;
-  // CYRILLIC SMALL LETTER NARROW O
+  // CYRILLIC SMALL LETTER NARROW O\n'
   if (C == 0x1c82)
     return 0x043e;
   if (C < 0x1c83)
@@ -399,18 +399,21 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1c84)
     return C + -6210;
-  // CYRILLIC SMALL LETTER THREE-LEGGED TE
+  // CYRILLIC SMALL LETTER THREE-LEGGED TE\n'
   if (C == 0x1c85)
     return 0x0442;
-  // CYRILLIC SMALL LETTER TALL HARD SIGN
+  // CYRILLIC SMALL LETTER TALL HARD SIGN\n'
   if (C == 0x1c86)
     return 0x044a;
-  // CYRILLIC SMALL LETTER TALL YAT
+  // CYRILLIC SMALL LETTER TALL YAT\n'
   if (C == 0x1c87)
     return 0x0463;
-  // CYRILLIC SMALL LETTER UNBLENDED UK
+  // CYRILLIC SMALL LETTER UNBLENDED UK\n'
   if (C == 0x1c88)
     return 0xa64b;
+  // CYRILLIC CAPITAL LETTER TJE\n'
+  if (C == 0x1c89)
+    return 0x1c8a;
   if (C < 0x1c90)
     return C;
   // 43 characters
@@ -426,10 +429,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 75 characters
   if (C <= 0x1e94)
     return C | 1;
-  // LATIN SMALL LETTER LONG S WITH DOT ABOVE
+  // LATIN SMALL LETTER LONG S WITH DOT ABOVE\n'
   if (C == 0x1e9b)
     return 0x1e61;
-  // LATIN CAPITAL LETTER SHARP S
+  // LATIN CAPITAL LETTER SHARP S\n'
   if (C == 0x1e9e)
     return 0x00df;
   if (C < 0x1ea0)
@@ -497,10 +500,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1fbb)
     return C + -74;
-  // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+  // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI\n'
   if (C == 0x1fbc)
     return 0x1fb3;
-  // GREEK PROSGEGRAMMENI
+  // GREEK PROSGEGRAMMENI\n'
   if (C == 0x1fbe)
     return 0x03b9;
   if (C < 0x1fc8)
@@ -508,10 +511,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 4 characters
   if (C <= 0x1fcb)
     return C + -86;
-  // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+  // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI\n'
   if (C == 0x1fcc)
     return 0x1fc3;
-  // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+  // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA\n'
   if (C == 0x1fd3)
     return 0x0390;
   if (C < 0x1fd8)
@@ -524,7 +527,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1fdb)
     return C + -100;
-  // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+  // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA\n'
   if (C == 0x1fe3)
     return 0x03b0;
   if (C < 0x1fe8)
@@ -537,7 +540,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1feb)
     return C + -112;
-  // GREEK CAPITAL LETTER RHO WITH DASIA
+  // GREEK CAPITAL LETTER RHO WITH DASIA\n'
   if (C == 0x1fec)
     return 0x1fe5;
   if (C < 0x1ff8)
@@ -550,19 +553,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1ffb)
     return C + -126;
-  // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+  // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI\n'
   if (C == 0x1ffc)
     return 0x1ff3;
-  // OHM SIGN
+  // OHM SIGN\n'
   if (C == 0x2126)
     return 0x03c9;
-  // KELVIN SIGN
+  // KELVIN SIGN\n'
   if (C == 0x212a)
     return 0x006b;
-  // ANGSTROM SIGN
+  // ANGSTROM SIGN\n'
   if (C == 0x212b)
     return 0x00e5;
-  // TURNED CAPITAL F
+  // TURNED CAPITAL F\n'
   if (C == 0x2132)
     return 0x214e;
   if (C < 0x2160)
@@ -570,7 +573,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 16 characters
   if (C <= 0x216f)
     return C + 16;
-  // ROMAN NUMERAL REVERSED ONE HUNDRED
+  // ROMAN NUMERAL REVERSED ONE HUNDRED\n'
   if (C == 0x2183)
     return 0x2184;
   if (C < 0x24b6)
@@ -583,16 +586,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 48 characters
   if (C <= 0x2c2f)
     return C + 48;
-  // LATIN CAPITAL LETTER L WITH DOUBLE BAR
+  // LATIN CAPITAL LETTER L WITH DOUBLE BAR\n'
   if (C == 0x2c60)
     return 0x2c61;
-  // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
+  // LATIN CAPITAL LETTER L WITH MIDDLE TILDE\n'
   if (C == 0x2c62)
     return 0x026b;
-  // LATIN CAPITAL LETTER P WITH STROKE
+  // LATIN CAPITAL LETTER P WITH STROKE\n'
   if (C == 0x2c63)
     return 0x1d7d;
-  // LATIN CAPITAL LETTER R WITH TAIL
+  // LATIN CAPITAL LETTER R WITH TAIL\n'
   if (C == 0x2c64)
     return 0x027d;
   if (C < 0x2c67)
@@ -600,16 +603,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x2c6b && C % 2 == 1)
     return C + 1;
-  // LATIN CAPITAL LETTER ALPHA
+  // LATIN CAPITAL LETTER ALPHA\n'
   if (C == 0x2c6d)
     return 0x0251;
-  // LATIN CAPITAL LETTER M WITH HOOK
+  // LATIN CAPITAL LETTER M WITH HOOK\n'
   if (C == 0x2c6e)
     return 0x0271;
-  // LATIN CAPITAL LETTER TURNED A
+  // LATIN CAPITAL LETTER TURNED A\n'
   if (C == 0x2c6f)
     return 0x0250;
-  // LATIN CAPITAL LETTER TURNED ALPHA
+  // LATIN CAPITAL LETTER TURNED ALPHA\n'
   if (C == 0x2c70)
     return 0x0252;
   if (C < 0x2c72)
@@ -662,7 +665,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0xa77b && C % 2 == 1)
     return C + 1;
-  // LATIN CAPITAL LETTER INSULAR G
+  // LATIN CAPITAL LETTER INSULAR G\n'
   if (C == 0xa77d)
     return 0x1d79;
   if (C < 0xa77e)
@@ -670,10 +673,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 5 characters
   if (C <= 0xa786)
     return C | 1;
-  // LATIN CAPITAL LETTER SALTILLO
+  // LATIN CAPITAL LETTER SALTILLO\n'
   if (C == 0xa78b)
     return 0xa78c;
-  // LATIN CAPITAL LETTER TURNED H
+  // LATIN CAPITAL LETTER TURNED H\n'
   if (C == 0xa78d)
     return 0x0265;
   if (C < 0xa790)
@@ -686,31 +689,31 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 10 characters
   if (C <= 0xa7a8)
     return C | 1;
-  // LATIN CAPITAL LETTER H WITH HOOK
+  // LATIN CAPITAL LETTER H WITH HOOK\n'
   if (C == 0xa7aa)
     return 0x0266;
-  // LATIN CAPITAL LETTER REVERSED OPEN E
+  // LATIN CAPITAL LETTER REVERSED OPEN E\n'
   if (C == 0xa7ab)
     return 0x025c;
-  // LATIN CAPITAL LETTER SCRIPT G
+  // LATIN CAPITAL LETTER SCRIPT G\n'
   if (C == 0xa7ac)
     return 0x0261;
-  // LATIN CAPITAL LETTER L WITH BELT
+  // LATIN CAPITAL LETTER L WITH BELT\n'
   if (C == 0xa7ad)
     return 0x026c;
-  // LATIN CAPITAL LETTER SMALL CAPITAL I
+  // LATIN CAPITAL LETTER SMALL CAPITAL I\n'
   if (C == 0xa7ae)
     return 0x026a;
-  // LATIN CAPITAL LETTER TURNED K
+  // LATIN CAPITAL LETTER TURNED K\n'
   if (C == 0xa7b0)
     return 0x029e;
-  // LATIN CAPITAL LETTER TURNED T
+  // LATIN CAPITAL LETTER TURNED T\n'
   if (C == 0xa7b1)
     return 0x0287;
-  // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
+  // LATIN CAPITAL LETTER J WITH CROSSED-TAIL\n'
   if (C == 0xa7b2)
     return 0x029d;
-  // LATIN CAPITAL LETTER CHI
+  // LATIN CAPITAL LETTER CHI\n'
   if (C == 0xa7b3)
     return 0xab53;
   if (C < 0xa7b4)
@@ -718,13 +721,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 8 characters
   if (C <= 0xa7c2)
     return C | 1;
-  // LATIN CAPITAL LETTER C WITH PALATAL HOOK
+  // LATIN CAPITAL LETTER C WITH PALATAL HOOK\n'
   if (C == 0xa7c4)
     return 0xa794;
-  // LATIN CAPITAL LETTER S WITH HOOK
+  // LATIN CAPITAL LETTER S WITH HOOK\n'
   if (C == 0xa7c5)
     return 0x0282;
-  // LATIN CAPITAL LETTER Z WITH PALATAL HOOK
+  // LATIN CAPITAL LETTER Z WITH PALATAL HOOK\n'
   if (C == 0xa7c6)
     return 0x1d8e;
   if (C < 0xa7c7)
@@ -732,22 +735,36 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0xa7c9 && C % 2 == 1)
     return C + 1;
-  if (C < 0xa7d0)
+  // LATIN CAPITAL LETTER RAMS HORN\n'
+  if (C == 0xa7cb)
+    return 0x0264;
+  if (C < 0xa7cc)
+    return C;
+  // 8 characters
+  if (C <= 0xa7da)
+    return C | 1;
+  // LATIN CAPITAL LETTER LAMBDA WITH STROKE\n'
+  if (C == 0xa7dc)
+    return 0x019b;
+  if (C < 0xa7dd)
     return C;
   // 2 characters
-  if (C <= 0xa7d6 && C % 6 == 0)
-    return C + 1;
-  if (C < 0xa7d8)
+  if (C <= 0xa7e2 && C % 5 == 3)
+    return C + -42342;
+  // LATIN CAPITAL LETTER REVERSED HALF H\n'
+  if (C == 0xa7f5)
+    return 0xa7f6;
+  if (C < 0xab6c)
     return C;
   // 2 characters
-  if (C <= 0xa7f5 && C % 29 == 19)
-    return C + 1;
+  if (C <= 0xab6d)
+    return C + -33;
   if (C < 0xab70)
     return C;
   // 80 characters
   if (C <= 0xabbf)
     return C + -38864;
-  // LATIN SMALL LIGATURE LONG S T
+  // LATIN SMALL LIGATURE LONG S T\n'
   if (C == 0xfb05)
     return 0xfb06;
   if (C < 0xff21)
@@ -790,6 +807,11 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 51 characters
   if (C <= 0x10cb2)
     return C + 64;
+  if (C < 0x10d50)
+    return C;
+  // 22 characters
+  if (C <= 0x10d65)
+    return C + 32;
   if (C < 0x118a0)
     return C;
   // 32 characters
@@ -800,6 +822,39 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 32 characters
   if (C <= 0x16e5f)
     return C + 32;
+  if (C < 0x16ea0)
+    return C;
+  // 25 characters
+  if (C <= 0x16eb8)
+    return C + 27;
+  if (C < 0x1df40)
+    return C;
+  // 2 characters
+  if (C <= 0x1df48 && C % 8 == 0)
+    return C + 1;
+  if (C < 0x1df4a)
+    return C;
+  // 2 characters
+  if (C <= 0x1df4d && C % 3 == 1)
+    return C + 1;
+  if (C < 0x1df51)
+    return C;
+  // 2 characters
+  if (C <= 0x1df68 && C % 23 == 0)
+    return C + 1;
+  if (C < 0x1df6a)
+    return C;
+  // 3 characters
+  if (C <= 0x1df6e)
+    return C | 1;
+  if (C < 0x1df72)
+    return C;
+  // 7 characters
+  if (C <= 0x1df7e)
+    return C | 1;
+  // LATIN SMALL LIGATURE LONG S WITH DESCENDER S\n'
+  if (C == 0x1df95)
+    return 0x00df;
   if (C < 0x1e900)
     return C;
   // 34 characters

>From 23a8738ac88fe6400efcfb7a6bb6d907cf9d6976 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:40:45 -0500
Subject: [PATCH 06/14] Update unicode-case-fold.py

---
 llvm/utils/unicode-case-fold.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/unicode-case-fold.py b/llvm/utils/unicode-case-fold.py
index ca0423082b724..cfa5b279facf3 100755
--- a/llvm/utils/unicode-case-fold.py
+++ b/llvm/utils/unicode-case-fold.py
@@ -40,7 +40,9 @@ def mappings(f):
         if "b" in group_1: group_1 = group_1[2:]
         from_char = int(group_1, 16)
         to_char = int(m.group(2), 16)
-        from_name = m.group(3)
+        group_3 = m.group(3)
+        if "\\n" in group_3: group_3 = group_3[:-3]
+        from_name = group_3
 
         if from_char <= previous_from:
             raise Exception("Duplicate or unsorted characters in input")
@@ -68,7 +70,6 @@ def stride(block):
 # shift and the stride between adjacent mappings (if any) is constant.
 def dump_block(b):
     global body
-
     if len(b) == 1:
         # Special case for handling blocks of length 1. We don't even need to
         # emit the "if (C < X) return C" check below as all characters in this

>From ead2efbcc813f7072f7a15547eb651c81f831541 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:41:50 -0500
Subject: [PATCH 07/14] Update UnicodeCaseFold.cpp

---
 llvm/lib/Support/UnicodeCaseFold.cpp | 224 +++++++++++++--------------
 1 file changed, 112 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Support/UnicodeCaseFold.cpp b/llvm/lib/Support/UnicodeCaseFold.cpp
index 007c6261568ca..3e44ac5210f71 100644
--- a/llvm/lib/Support/UnicodeCaseFold.cpp
+++ b/llvm/lib/Support/UnicodeCaseFold.cpp
@@ -19,7 +19,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 26 characters
   if (C <= 0x005a)
     return C + 32;
-  // MICRO SIGN\n'
+  // MICRO SIGN
   if (C == 0x00b5)
     return 0x03bc;
   if (C < 0x00c0)
@@ -52,7 +52,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 23 characters
   if (C <= 0x0176)
     return C | 1;
-  // LATIN CAPITAL LETTER Y WITH DIAERESIS\n'
+  // LATIN CAPITAL LETTER Y WITH DIAERESIS
   if (C == 0x0178)
     return 0x00ff;
   if (C < 0x0179)
@@ -60,10 +60,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x017d && C % 2 == 1)
     return C + 1;
-  // LATIN SMALL LETTER LONG S\n'
+  // LATIN SMALL LETTER LONG S
   if (C == 0x017f)
     return 0x0073;
-  // LATIN CAPITAL LETTER B WITH HOOK\n'
+  // LATIN CAPITAL LETTER B WITH HOOK
   if (C == 0x0181)
     return 0x0253;
   if (C < 0x0182)
@@ -71,10 +71,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x0184)
     return C | 1;
-  // LATIN CAPITAL LETTER OPEN O\n'
+  // LATIN CAPITAL LETTER OPEN O
   if (C == 0x0186)
     return 0x0254;
-  // LATIN CAPITAL LETTER C WITH HOOK\n'
+  // LATIN CAPITAL LETTER C WITH HOOK
   if (C == 0x0187)
     return 0x0188;
   if (C < 0x0189)
@@ -82,43 +82,43 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x018a)
     return C + 205;
-  // LATIN CAPITAL LETTER D WITH TOPBAR\n'
+  // LATIN CAPITAL LETTER D WITH TOPBAR
   if (C == 0x018b)
     return 0x018c;
-  // LATIN CAPITAL LETTER REVERSED E\n'
+  // LATIN CAPITAL LETTER REVERSED E
   if (C == 0x018e)
     return 0x01dd;
-  // LATIN CAPITAL LETTER SCHWA\n'
+  // LATIN CAPITAL LETTER SCHWA
   if (C == 0x018f)
     return 0x0259;
-  // LATIN CAPITAL LETTER OPEN E\n'
+  // LATIN CAPITAL LETTER OPEN E
   if (C == 0x0190)
     return 0x025b;
-  // LATIN CAPITAL LETTER F WITH HOOK\n'
+  // LATIN CAPITAL LETTER F WITH HOOK
   if (C == 0x0191)
     return 0x0192;
-  // LATIN CAPITAL LETTER G WITH HOOK\n'
+  // LATIN CAPITAL LETTER G WITH HOOK
   if (C == 0x0193)
     return 0x0260;
-  // LATIN CAPITAL LETTER GAMMA\n'
+  // LATIN CAPITAL LETTER GAMMA
   if (C == 0x0194)
     return 0x0263;
-  // LATIN CAPITAL LETTER IOTA\n'
+  // LATIN CAPITAL LETTER IOTA
   if (C == 0x0196)
     return 0x0269;
-  // LATIN CAPITAL LETTER I WITH STROKE\n'
+  // LATIN CAPITAL LETTER I WITH STROKE
   if (C == 0x0197)
     return 0x0268;
-  // LATIN CAPITAL LETTER K WITH HOOK\n'
+  // LATIN CAPITAL LETTER K WITH HOOK
   if (C == 0x0198)
     return 0x0199;
-  // LATIN CAPITAL LETTER TURNED M\n'
+  // LATIN CAPITAL LETTER TURNED M
   if (C == 0x019c)
     return 0x026f;
-  // LATIN CAPITAL LETTER N WITH LEFT HOOK\n'
+  // LATIN CAPITAL LETTER N WITH LEFT HOOK
   if (C == 0x019d)
     return 0x0272;
-  // LATIN CAPITAL LETTER O WITH MIDDLE TILDE\n'
+  // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
   if (C == 0x019f)
     return 0x0275;
   if (C < 0x01a0)
@@ -126,22 +126,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x01a4)
     return C | 1;
-  // LATIN LETTER YR\n'
+  // LATIN LETTER YR
   if (C == 0x01a6)
     return 0x0280;
-  // LATIN CAPITAL LETTER TONE TWO\n'
+  // LATIN CAPITAL LETTER TONE TWO
   if (C == 0x01a7)
     return 0x01a8;
-  // LATIN CAPITAL LETTER ESH\n'
+  // LATIN CAPITAL LETTER ESH
   if (C == 0x01a9)
     return 0x0283;
-  // LATIN CAPITAL LETTER T WITH HOOK\n'
+  // LATIN CAPITAL LETTER T WITH HOOK
   if (C == 0x01ac)
     return 0x01ad;
-  // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK\n'
+  // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
   if (C == 0x01ae)
     return 0x0288;
-  // LATIN CAPITAL LETTER U WITH HORN\n'
+  // LATIN CAPITAL LETTER U WITH HORN
   if (C == 0x01af)
     return 0x01b0;
   if (C < 0x01b1)
@@ -154,7 +154,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x01b5 && C % 2 == 1)
     return C + 1;
-  // LATIN CAPITAL LETTER EZH\n'
+  // LATIN CAPITAL LETTER EZH
   if (C == 0x01b7)
     return 0x0292;
   if (C < 0x01b8)
@@ -162,19 +162,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x01bc && C % 4 == 0)
     return C + 1;
-  // LATIN CAPITAL LETTER DZ WITH CARON\n'
+  // LATIN CAPITAL LETTER DZ WITH CARON
   if (C == 0x01c4)
     return 0x01c6;
-  // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON\n'
+  // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
   if (C == 0x01c5)
     return 0x01c6;
-  // LATIN CAPITAL LETTER LJ\n'
+  // LATIN CAPITAL LETTER LJ
   if (C == 0x01c7)
     return 0x01c9;
-  // LATIN CAPITAL LETTER L WITH SMALL LETTER J\n'
+  // LATIN CAPITAL LETTER L WITH SMALL LETTER J
   if (C == 0x01c8)
     return 0x01c9;
-  // LATIN CAPITAL LETTER NJ\n'
+  // LATIN CAPITAL LETTER NJ
   if (C == 0x01ca)
     return 0x01cc;
   if (C < 0x01cb)
@@ -187,7 +187,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 9 characters
   if (C <= 0x01ee)
     return C | 1;
-  // LATIN CAPITAL LETTER DZ\n'
+  // LATIN CAPITAL LETTER DZ
   if (C == 0x01f1)
     return 0x01f3;
   if (C < 0x01f2)
@@ -195,10 +195,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x01f4)
     return C | 1;
-  // LATIN CAPITAL LETTER HWAIR\n'
+  // LATIN CAPITAL LETTER HWAIR
   if (C == 0x01f6)
     return 0x0195;
-  // LATIN CAPITAL LETTER WYNN\n'
+  // LATIN CAPITAL LETTER WYNN
   if (C == 0x01f7)
     return 0x01bf;
   if (C < 0x01f8)
@@ -206,7 +206,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 20 characters
   if (C <= 0x021e)
     return C | 1;
-  // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG\n'
+  // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
   if (C == 0x0220)
     return 0x019e;
   if (C < 0x0222)
@@ -214,28 +214,28 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 9 characters
   if (C <= 0x0232)
     return C | 1;
-  // LATIN CAPITAL LETTER A WITH STROKE\n'
+  // LATIN CAPITAL LETTER A WITH STROKE
   if (C == 0x023a)
     return 0x2c65;
-  // LATIN CAPITAL LETTER C WITH STROKE\n'
+  // LATIN CAPITAL LETTER C WITH STROKE
   if (C == 0x023b)
     return 0x023c;
-  // LATIN CAPITAL LETTER L WITH BAR\n'
+  // LATIN CAPITAL LETTER L WITH BAR
   if (C == 0x023d)
     return 0x019a;
-  // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE\n'
+  // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
   if (C == 0x023e)
     return 0x2c66;
-  // LATIN CAPITAL LETTER GLOTTAL STOP\n'
+  // LATIN CAPITAL LETTER GLOTTAL STOP
   if (C == 0x0241)
     return 0x0242;
-  // LATIN CAPITAL LETTER B WITH STROKE\n'
+  // LATIN CAPITAL LETTER B WITH STROKE
   if (C == 0x0243)
     return 0x0180;
-  // LATIN CAPITAL LETTER U BAR\n'
+  // LATIN CAPITAL LETTER U BAR
   if (C == 0x0244)
     return 0x0289;
-  // LATIN CAPITAL LETTER TURNED V\n'
+  // LATIN CAPITAL LETTER TURNED V
   if (C == 0x0245)
     return 0x028c;
   if (C < 0x0246)
@@ -243,7 +243,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 5 characters
   if (C <= 0x024e)
     return C | 1;
-  // COMBINING GREEK YPOGEGRAMMENI\n'
+  // COMBINING GREEK YPOGEGRAMMENI
   if (C == 0x0345)
     return 0x03b9;
   if (C < 0x0370)
@@ -251,13 +251,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x0372)
     return C | 1;
-  // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA\n'
+  // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
   if (C == 0x0376)
     return 0x0377;
-  // GREEK CAPITAL LETTER YOT\n'
+  // GREEK CAPITAL LETTER YOT
   if (C == 0x037f)
     return 0x03f3;
-  // GREEK CAPITAL LETTER ALPHA WITH TONOS\n'
+  // GREEK CAPITAL LETTER ALPHA WITH TONOS
   if (C == 0x0386)
     return 0x03ac;
   if (C < 0x0388)
@@ -265,7 +265,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x038a)
     return C + 37;
-  // GREEK CAPITAL LETTER OMICRON WITH TONOS\n'
+  // GREEK CAPITAL LETTER OMICRON WITH TONOS
   if (C == 0x038c)
     return 0x03cc;
   if (C < 0x038e)
@@ -283,22 +283,22 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 9 characters
   if (C <= 0x03ab)
     return C + 32;
-  // GREEK SMALL LETTER FINAL SIGMA\n'
+  // GREEK SMALL LETTER FINAL SIGMA
   if (C == 0x03c2)
     return 0x03c3;
-  // GREEK CAPITAL KAI SYMBOL\n'
+  // GREEK CAPITAL KAI SYMBOL
   if (C == 0x03cf)
     return 0x03d7;
-  // GREEK BETA SYMBOL\n'
+  // GREEK BETA SYMBOL
   if (C == 0x03d0)
     return 0x03b2;
-  // GREEK THETA SYMBOL\n'
+  // GREEK THETA SYMBOL
   if (C == 0x03d1)
     return 0x03b8;
-  // GREEK PHI SYMBOL\n'
+  // GREEK PHI SYMBOL
   if (C == 0x03d5)
     return 0x03c6;
-  // GREEK PI SYMBOL\n'
+  // GREEK PI SYMBOL
   if (C == 0x03d6)
     return 0x03c0;
   if (C < 0x03d8)
@@ -306,25 +306,25 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 12 characters
   if (C <= 0x03ee)
     return C | 1;
-  // GREEK KAPPA SYMBOL\n'
+  // GREEK KAPPA SYMBOL
   if (C == 0x03f0)
     return 0x03ba;
-  // GREEK RHO SYMBOL\n'
+  // GREEK RHO SYMBOL
   if (C == 0x03f1)
     return 0x03c1;
-  // GREEK CAPITAL THETA SYMBOL\n'
+  // GREEK CAPITAL THETA SYMBOL
   if (C == 0x03f4)
     return 0x03b8;
-  // GREEK LUNATE EPSILON SYMBOL\n'
+  // GREEK LUNATE EPSILON SYMBOL
   if (C == 0x03f5)
     return 0x03b5;
-  // GREEK CAPITAL LETTER SHO\n'
+  // GREEK CAPITAL LETTER SHO
   if (C == 0x03f7)
     return 0x03f8;
-  // GREEK CAPITAL LUNATE SIGMA SYMBOL\n'
+  // GREEK CAPITAL LUNATE SIGMA SYMBOL
   if (C == 0x03f9)
     return 0x03f2;
-  // GREEK CAPITAL LETTER SAN\n'
+  // GREEK CAPITAL LETTER SAN
   if (C == 0x03fa)
     return 0x03fb;
   if (C < 0x03fd)
@@ -352,7 +352,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 27 characters
   if (C <= 0x04be)
     return C | 1;
-  // CYRILLIC LETTER PALOCHKA\n'
+  // CYRILLIC LETTER PALOCHKA
   if (C == 0x04c0)
     return 0x04cf;
   if (C < 0x04c1)
@@ -385,13 +385,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 6 characters
   if (C <= 0x13fd)
     return C + -8;
-  // CYRILLIC SMALL LETTER ROUNDED VE\n'
+  // CYRILLIC SMALL LETTER ROUNDED VE
   if (C == 0x1c80)
     return 0x0432;
-  // CYRILLIC SMALL LETTER LONG-LEGGED DE\n'
+  // CYRILLIC SMALL LETTER LONG-LEGGED DE
   if (C == 0x1c81)
     return 0x0434;
-  // CYRILLIC SMALL LETTER NARROW O\n'
+  // CYRILLIC SMALL LETTER NARROW O
   if (C == 0x1c82)
     return 0x043e;
   if (C < 0x1c83)
@@ -399,19 +399,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1c84)
     return C + -6210;
-  // CYRILLIC SMALL LETTER THREE-LEGGED TE\n'
+  // CYRILLIC SMALL LETTER THREE-LEGGED TE
   if (C == 0x1c85)
     return 0x0442;
-  // CYRILLIC SMALL LETTER TALL HARD SIGN\n'
+  // CYRILLIC SMALL LETTER TALL HARD SIGN
   if (C == 0x1c86)
     return 0x044a;
-  // CYRILLIC SMALL LETTER TALL YAT\n'
+  // CYRILLIC SMALL LETTER TALL YAT
   if (C == 0x1c87)
     return 0x0463;
-  // CYRILLIC SMALL LETTER UNBLENDED UK\n'
+  // CYRILLIC SMALL LETTER UNBLENDED UK
   if (C == 0x1c88)
     return 0xa64b;
-  // CYRILLIC CAPITAL LETTER TJE\n'
+  // CYRILLIC CAPITAL LETTER TJE
   if (C == 0x1c89)
     return 0x1c8a;
   if (C < 0x1c90)
@@ -429,10 +429,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 75 characters
   if (C <= 0x1e94)
     return C | 1;
-  // LATIN SMALL LETTER LONG S WITH DOT ABOVE\n'
+  // LATIN SMALL LETTER LONG S WITH DOT ABOVE
   if (C == 0x1e9b)
     return 0x1e61;
-  // LATIN CAPITAL LETTER SHARP S\n'
+  // LATIN CAPITAL LETTER SHARP S
   if (C == 0x1e9e)
     return 0x00df;
   if (C < 0x1ea0)
@@ -500,10 +500,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1fbb)
     return C + -74;
-  // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI\n'
+  // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
   if (C == 0x1fbc)
     return 0x1fb3;
-  // GREEK PROSGEGRAMMENI\n'
+  // GREEK PROSGEGRAMMENI
   if (C == 0x1fbe)
     return 0x03b9;
   if (C < 0x1fc8)
@@ -511,10 +511,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 4 characters
   if (C <= 0x1fcb)
     return C + -86;
-  // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI\n'
+  // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
   if (C == 0x1fcc)
     return 0x1fc3;
-  // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA\n'
+  // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
   if (C == 0x1fd3)
     return 0x0390;
   if (C < 0x1fd8)
@@ -527,7 +527,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1fdb)
     return C + -100;
-  // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA\n'
+  // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
   if (C == 0x1fe3)
     return 0x03b0;
   if (C < 0x1fe8)
@@ -540,7 +540,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1feb)
     return C + -112;
-  // GREEK CAPITAL LETTER RHO WITH DASIA\n'
+  // GREEK CAPITAL LETTER RHO WITH DASIA
   if (C == 0x1fec)
     return 0x1fe5;
   if (C < 0x1ff8)
@@ -553,19 +553,19 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0x1ffb)
     return C + -126;
-  // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI\n'
+  // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
   if (C == 0x1ffc)
     return 0x1ff3;
-  // OHM SIGN\n'
+  // OHM SIGN
   if (C == 0x2126)
     return 0x03c9;
-  // KELVIN SIGN\n'
+  // KELVIN SIGN
   if (C == 0x212a)
     return 0x006b;
-  // ANGSTROM SIGN\n'
+  // ANGSTROM SIGN
   if (C == 0x212b)
     return 0x00e5;
-  // TURNED CAPITAL F\n'
+  // TURNED CAPITAL F
   if (C == 0x2132)
     return 0x214e;
   if (C < 0x2160)
@@ -573,7 +573,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 16 characters
   if (C <= 0x216f)
     return C + 16;
-  // ROMAN NUMERAL REVERSED ONE HUNDRED\n'
+  // ROMAN NUMERAL REVERSED ONE HUNDRED
   if (C == 0x2183)
     return 0x2184;
   if (C < 0x24b6)
@@ -586,16 +586,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 48 characters
   if (C <= 0x2c2f)
     return C + 48;
-  // LATIN CAPITAL LETTER L WITH DOUBLE BAR\n'
+  // LATIN CAPITAL LETTER L WITH DOUBLE BAR
   if (C == 0x2c60)
     return 0x2c61;
-  // LATIN CAPITAL LETTER L WITH MIDDLE TILDE\n'
+  // LATIN CAPITAL LETTER L WITH MIDDLE TILDE
   if (C == 0x2c62)
     return 0x026b;
-  // LATIN CAPITAL LETTER P WITH STROKE\n'
+  // LATIN CAPITAL LETTER P WITH STROKE
   if (C == 0x2c63)
     return 0x1d7d;
-  // LATIN CAPITAL LETTER R WITH TAIL\n'
+  // LATIN CAPITAL LETTER R WITH TAIL
   if (C == 0x2c64)
     return 0x027d;
   if (C < 0x2c67)
@@ -603,16 +603,16 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 3 characters
   if (C <= 0x2c6b && C % 2 == 1)
     return C + 1;
-  // LATIN CAPITAL LETTER ALPHA\n'
+  // LATIN CAPITAL LETTER ALPHA
   if (C == 0x2c6d)
     return 0x0251;
-  // LATIN CAPITAL LETTER M WITH HOOK\n'
+  // LATIN CAPITAL LETTER M WITH HOOK
   if (C == 0x2c6e)
     return 0x0271;
-  // LATIN CAPITAL LETTER TURNED A\n'
+  // LATIN CAPITAL LETTER TURNED A
   if (C == 0x2c6f)
     return 0x0250;
-  // LATIN CAPITAL LETTER TURNED ALPHA\n'
+  // LATIN CAPITAL LETTER TURNED ALPHA
   if (C == 0x2c70)
     return 0x0252;
   if (C < 0x2c72)
@@ -665,7 +665,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0xa77b && C % 2 == 1)
     return C + 1;
-  // LATIN CAPITAL LETTER INSULAR G\n'
+  // LATIN CAPITAL LETTER INSULAR G
   if (C == 0xa77d)
     return 0x1d79;
   if (C < 0xa77e)
@@ -673,10 +673,10 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 5 characters
   if (C <= 0xa786)
     return C | 1;
-  // LATIN CAPITAL LETTER SALTILLO\n'
+  // LATIN CAPITAL LETTER SALTILLO
   if (C == 0xa78b)
     return 0xa78c;
-  // LATIN CAPITAL LETTER TURNED H\n'
+  // LATIN CAPITAL LETTER TURNED H
   if (C == 0xa78d)
     return 0x0265;
   if (C < 0xa790)
@@ -689,31 +689,31 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 10 characters
   if (C <= 0xa7a8)
     return C | 1;
-  // LATIN CAPITAL LETTER H WITH HOOK\n'
+  // LATIN CAPITAL LETTER H WITH HOOK
   if (C == 0xa7aa)
     return 0x0266;
-  // LATIN CAPITAL LETTER REVERSED OPEN E\n'
+  // LATIN CAPITAL LETTER REVERSED OPEN E
   if (C == 0xa7ab)
     return 0x025c;
-  // LATIN CAPITAL LETTER SCRIPT G\n'
+  // LATIN CAPITAL LETTER SCRIPT G
   if (C == 0xa7ac)
     return 0x0261;
-  // LATIN CAPITAL LETTER L WITH BELT\n'
+  // LATIN CAPITAL LETTER L WITH BELT
   if (C == 0xa7ad)
     return 0x026c;
-  // LATIN CAPITAL LETTER SMALL CAPITAL I\n'
+  // LATIN CAPITAL LETTER SMALL CAPITAL I
   if (C == 0xa7ae)
     return 0x026a;
-  // LATIN CAPITAL LETTER TURNED K\n'
+  // LATIN CAPITAL LETTER TURNED K
   if (C == 0xa7b0)
     return 0x029e;
-  // LATIN CAPITAL LETTER TURNED T\n'
+  // LATIN CAPITAL LETTER TURNED T
   if (C == 0xa7b1)
     return 0x0287;
-  // LATIN CAPITAL LETTER J WITH CROSSED-TAIL\n'
+  // LATIN CAPITAL LETTER J WITH CROSSED-TAIL
   if (C == 0xa7b2)
     return 0x029d;
-  // LATIN CAPITAL LETTER CHI\n'
+  // LATIN CAPITAL LETTER CHI
   if (C == 0xa7b3)
     return 0xab53;
   if (C < 0xa7b4)
@@ -721,13 +721,13 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 8 characters
   if (C <= 0xa7c2)
     return C | 1;
-  // LATIN CAPITAL LETTER C WITH PALATAL HOOK\n'
+  // LATIN CAPITAL LETTER C WITH PALATAL HOOK
   if (C == 0xa7c4)
     return 0xa794;
-  // LATIN CAPITAL LETTER S WITH HOOK\n'
+  // LATIN CAPITAL LETTER S WITH HOOK
   if (C == 0xa7c5)
     return 0x0282;
-  // LATIN CAPITAL LETTER Z WITH PALATAL HOOK\n'
+  // LATIN CAPITAL LETTER Z WITH PALATAL HOOK
   if (C == 0xa7c6)
     return 0x1d8e;
   if (C < 0xa7c7)
@@ -735,7 +735,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0xa7c9 && C % 2 == 1)
     return C + 1;
-  // LATIN CAPITAL LETTER RAMS HORN\n'
+  // LATIN CAPITAL LETTER RAMS HORN
   if (C == 0xa7cb)
     return 0x0264;
   if (C < 0xa7cc)
@@ -743,7 +743,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 8 characters
   if (C <= 0xa7da)
     return C | 1;
-  // LATIN CAPITAL LETTER LAMBDA WITH STROKE\n'
+  // LATIN CAPITAL LETTER LAMBDA WITH STROKE
   if (C == 0xa7dc)
     return 0x019b;
   if (C < 0xa7dd)
@@ -751,7 +751,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 2 characters
   if (C <= 0xa7e2 && C % 5 == 3)
     return C + -42342;
-  // LATIN CAPITAL LETTER REVERSED HALF H\n'
+  // LATIN CAPITAL LETTER REVERSED HALF H
   if (C == 0xa7f5)
     return 0xa7f6;
   if (C < 0xab6c)
@@ -764,7 +764,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 80 characters
   if (C <= 0xabbf)
     return C + -38864;
-  // LATIN SMALL LIGATURE LONG S T\n'
+  // LATIN SMALL LIGATURE LONG S T
   if (C == 0xfb05)
     return 0xfb06;
   if (C < 0xff21)
@@ -852,7 +852,7 @@ int llvm::sys::unicode::foldCharSimple(int C) {
   // 7 characters
   if (C <= 0x1df7e)
     return C | 1;
-  // LATIN SMALL LIGATURE LONG S WITH DESCENDER S\n'
+  // LATIN SMALL LIGATURE LONG S WITH DESCENDER S
   if (C == 0x1df95)
     return 0x00df;
   if (C < 0x1e900)

>From 293470d13ba0bdfbd254a8a2917daa597c2ac54e Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Thu, 21 May 2026 15:51:48 -0500
Subject: [PATCH 08/14] Update UnicodeNameMappingGenerator.cpp

---
 llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
index 106a51b80afca..97dd47fba7a79 100644
--- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
+++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
@@ -354,9 +354,9 @@ int main(int argc, char **argv) {
          "Usage: %s UnicodeData.txt NameAliases.txt output\n\n",
          argv[0]);
   printf("NameAliases.txt can be found at "
-         "https://unicode.org/Public/15.1.0/ucd/NameAliases.txt\n"
+         "https://unicode.org/Public/18.0.0/ucd/NameAliases.txt\n"
          "UnicodeData.txt can be found at "
-         "https://unicode.org/Public/15.1.0/ucd/UnicodeData.txt\n\n");
+         "https://unicode.org/Public/18.0.0/ucd/UnicodeData.txt\n\n");
 
   if (argc != 4)
     return EXIT_FAILURE;

>From c78ce9bb0fd7ed90c131722592c3186184127023 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:03:06 -0500
Subject: [PATCH 09/14] Update generate_escaped_output_table.py

---
 libcxx/utils/generate_escaped_output_table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index dd7173201b466..20bf572dc6dbe 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -105,7 +105,7 @@ def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
 /// - Unassigned.
 ///
 /// The data is generated from
-/// - https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt
+/// - https://www.unicode.org/Public/18.0.0/ucd/extracted/DerivedGeneralCategory.txt
 ///
 /// The table is similar to the table
 ///  __extended_grapheme_custer_property_boundary::__entries

>From f8ffabc8541eb32acae5d0656be5501e4d0ad174 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:11:56 -0500
Subject: [PATCH 10/14] Update entries of last two blocks for 18.0.0

---
 libcxx/utils/generate_escaped_output_table.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libcxx/utils/generate_escaped_output_table.py b/libcxx/utils/generate_escaped_output_table.py
index 20bf572dc6dbe..d043dfb1a681c 100755
--- a/libcxx/utils/generate_escaped_output_table.py
+++ b/libcxx/utils/generate_escaped_output_table.py
@@ -330,10 +330,10 @@ def generate_data_tables() -> str:
     # The exact entries may differ between Unicode versions. When these numbers
     # change the test needs to be updated too.
     #   libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
-    assert (data[-2].lower) == 0x323B0
-    assert (data[-2].upper) == 0xE00FF
-    assert (data[-1].lower) == 0xE01F0
-    assert (data[-1].upper) == 0x10FFFF
+    assert (data[-2].lower) == 0x3FC40, hex(data[-2].lower)
+    assert (data[-2].upper) == 0xE00FF, hex(data[-2].upper)
+    assert (data[-1].lower) == 0xE01F0, hex(data[-1].lower)
+    assert (data[-1].upper) == 0x10FFFF, hex(data[-1].upper)
 
     return "\n".join(
         [

>From 8a8a1cf2b8851c03d97deafc01bc9d3b2ead2773 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:13:33 -0500
Subject: [PATCH 11/14] Update excluded range

---
 .../format.string/format.string.std/escaped_output.pass.cpp     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index d24db360173b1..39a21fbeb9778 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -73,7 +73,7 @@ inline constexpr int C  = Cc + Cf + Cs + Co + Cn;
 // [323B0, 10FFFF] use a hand-crafted algorithm.
 //
 // This means a number of entries are omitted
-inline constexpr int excluded = ((0x10FFFF - 0x323B0) + 1) - 240;
+inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;
 
 inline constexpr int entries = Z + C - excluded;
 

>From c6c47f6c028ef3e862661c0209996cc43bb7b2d6 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 06:14:43 -0500
Subject: [PATCH 12/14] Update escaped_output.pass.cpp

---
 .../format.string/format.string.std/escaped_output.pass.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index 39a21fbeb9778..3c5502652866b 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -66,11 +66,11 @@ inline constexpr int C  = Cc + Cf + Cs + Co + Cn;
 // 100000..10FFFD; Co # [65534] <private-use-100000>..<private-use-10FFFD>
 // 10FFFE..10FFFF; Cn #   [2] <noncharacter-10FFFE>..<noncharacter-10FFFF>
 //
-// It can be observed all entries in the range 323B0..10FFFF are in the
+// It can be observed all entries in the range 3FC40..10FFFF are in the
 // categories Cf, Co, Cn, except a small range with the property Mn.
 // In order to reduce the size of the table only the entires in the range
-// [0000, 323B0) are stored in the table. The entries in the range
-// [323B0, 10FFFF] use a hand-crafted algorithm.
+// [0000, 3FC40) are stored in the table. The entries in the range
+// [3FC40, 10FFFF] use a hand-crafted algorithm.
 //
 // This means a number of entries are omitted
 inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;

>From b6f14c307888132e9ee09fd1651ae2764ae75344 Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 07:47:12 -0500
Subject: [PATCH 13/14] Account for new Unicode property ranges in 18

---
 .../format.string.std/escaped_output.pass.cpp          | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index 3c5502652866b..ee1fae71f66ea 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -48,13 +48,15 @@ inline constexpr int Cc = 65;
 inline constexpr int Cf = 170;
 inline constexpr int Cs = 2'048;
 inline constexpr int Co = 137'468;
-inline constexpr int Cn = 819'533;
+inline constexpr int Cn = 801'683;
 inline constexpr int C  = Cc + Cf + Cs + Co + Cn;
 
 // This is the final part of the Unicode properties table:
 //
-// 31350..323AF  ; Lo # [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
-// 323B0..E0000  ; Cn # [711761] <reserved-323B0>..<reserved-E0000>
+// 31350..33479  ; Lo # [8490] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-33479
+// 3347A..3CFFF  ; Cn # [39814] <reserved-3347A>..<reserved-3CFFF>
+// 3D000..3FC3F  ; Lo # [11328] SEAL CHARACTER-3D000..SEAL CHARACTER-3FC3F
+// 3FC40..E0000  ; Cn # [656321] <reserved-3FC40>..<reserved-E0000>
 // E0001         ; Cf #       LANGUAGE TAG
 // E0002..E001F  ; Cn #  [30] <reserved-E0002>..<reserved-E001F>
 // E0020..E007F  ; Cf #  [96] TAG SPACE..CANCEL TAG
@@ -73,7 +75,7 @@ inline constexpr int C  = Cc + Cf + Cs + Co + Cn;
 // [3FC40, 10FFFF] use a hand-crafted algorithm.
 //
 // This means a number of entries are omitted
-inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;
+inline constexpr int excluded = ((0x10FFFF - 0x3347A) + 1) - 240;
 
 inline constexpr int entries = Z + C - excluded;
 

>From 6b810bc4666d02b99eea232db0d5eab566aa8c4c Mon Sep 17 00:00:00 2001
From: Charlie Lin <tuug at gmx.us>
Date: Fri, 22 May 2026 08:24:25 -0500
Subject: [PATCH 14/14] Final corrected excluded codepoint is 0x3fc40

---
 .../format.string/format.string.std/escaped_output.pass.cpp     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
index ee1fae71f66ea..9e51b064e6f5a 100644
--- a/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.string/format.string.std/escaped_output.pass.cpp
@@ -75,7 +75,7 @@ inline constexpr int C  = Cc + Cf + Cs + Co + Cn;
 // [3FC40, 10FFFF] use a hand-crafted algorithm.
 //
 // This means a number of entries are omitted
-inline constexpr int excluded = ((0x10FFFF - 0x3347A) + 1) - 240;
+inline constexpr int excluded = ((0x10FFFF - 0x3FC40) + 1) - 240;
 
 inline constexpr int entries = Z + C - excluded;
 



More information about the libcxx-commits mailing list