[PATCH] D138518: Update the list of double width codepoints

Corentin Jabot via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 28 06:13:47 PST 2022


This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rG2903769bf524: Update the list of double width codepoints (authored by cor3ntin).

Changed prior to commit:
  https://reviews.llvm.org/D138518?vs=477263&id=478212#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D138518/new/

https://reviews.llvm.org/D138518

Files:
  llvm/lib/Support/Unicode.cpp
  llvm/unittests/Support/UnicodeTest.cpp


Index: llvm/unittests/Support/UnicodeTest.cpp
===================================================================
--- llvm/unittests/Support/UnicodeTest.cpp
+++ llvm/unittests/Support/UnicodeTest.cpp
@@ -45,6 +45,11 @@
   EXPECT_EQ(3, columnWidthUTF8("q\344\270\200"));
   EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200"));
 
+  EXPECT_EQ(2, columnWidthUTF8("\u231A")); // WATCH (emoji)
+  EXPECT_EQ(2, columnWidthUTF8("\U0001FADB")); // PEA POD (Unicode 15 emoji)
+  EXPECT_EQ(2, columnWidthUTF8("\U0001B132")); // HIRAGANA LETTER SMALL KO
+  EXPECT_EQ(2, columnWidthUTF8("\U00017042")); // TANGUT IDEOGRAPH
+
   // Invalid UTF-8 strings, columnWidthUTF8 should error out.
   EXPECT_EQ(-2, columnWidthUTF8("\344"));
   EXPECT_EQ(-2, columnWidthUTF8("\344\270"));
Index: llvm/lib/Support/Unicode.cpp
===================================================================
--- llvm/lib/Support/Unicode.cpp
+++ llvm/lib/Support/Unicode.cpp
@@ -300,8 +300,7 @@
 ///   * 0 for non-spacing and enclosing combining marks;
 ///   * 2 for CJK characters excluding halfwidth forms;
 ///   * 1 for all remaining characters.
-static inline int charWidth(int UCS)
-{
+static inline int charWidth(int UCS) {
   if (!isPrintable(UCS))
     return ErrorNonPrintableCharacter;
 
@@ -430,26 +429,45 @@
   if (CombiningCharacters.contains(UCS))
     return 0;
 
+  // A codepoint is treated as double width if it has the property
+  // East_Asian_Width=F|W (Fullwidth or Wide), or if it belongs to one of:
+  //   * Misc Symbols and Pictographs (U+1F300...U+1F5FF)
+  //   * Supplemental Symbols and Pictographs (U+1F900...U+1F9FF)
   static const UnicodeCharRange DoubleWidthCharacterRanges[] = {
-    // Hangul Jamo
-    { 0x1100, 0x11FF },
-    // Deprecated fullwidth angle brackets
-    { 0x2329, 0x232A },
-    // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
-    // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
-    { 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
-    // Hangul
-    { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
-    // CJK Unified Ideographs
-    { 0xF900, 0xFAFF },
-    // Vertical forms
-    { 0xFE10, 0xFE19 },
-    // CJK Compatibility Forms + Small Form Variants
-    { 0xFE30, 0xFE6F },
-    // Fullwidth forms
-    { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
-    // CJK Unified Ideographs
-    { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
+      {0x1100, 0x115F},   {0x231A, 0x231B},   {0x2329, 0x232A},
+      {0x23E9, 0x23EC},   {0x23F0, 0x23F0},   {0x23F3, 0x23F3},
+      {0x25FD, 0x25FE},   {0x2614, 0x2615},   {0x2648, 0x2653},
+      {0x267F, 0x267F},   {0x2693, 0x2693},   {0x26A1, 0x26A1},
+      {0x26AA, 0x26AB},   {0x26BD, 0x26BE},   {0x26C4, 0x26C5},
+      {0x26CE, 0x26CE},   {0x26D4, 0x26D4},   {0x26EA, 0x26EA},
+      {0x26F2, 0x26F3},   {0x26F5, 0x26F5},   {0x26FA, 0x26FA},
+      {0x26FD, 0x26FD},   {0x2705, 0x2705},   {0x270A, 0x270B},
+      {0x2728, 0x2728},   {0x274C, 0x274C},   {0x274E, 0x274E},
+      {0x2753, 0x2755},   {0x2757, 0x2757},   {0x2795, 0x2797},
+      {0x27B0, 0x27B0},   {0x27BF, 0x27BF},   {0x2B1B, 0x2B1C},
+      {0x2B50, 0x2B50},   {0x2B55, 0x2B55},   {0x2E80, 0x2E99},
+      {0x2E9B, 0x2EF3},   {0x2F00, 0x2FD5},   {0x2FF0, 0x2FFB},
+      {0x3000, 0x303E},   {0x3041, 0x3096},   {0x3099, 0x30FF},
+      {0x3105, 0x312F},   {0x3131, 0x318E},   {0x3190, 0x31E3},
+      {0x31F0, 0x321E},   {0x3220, 0x3247},   {0x3250, 0xA48C},
+      {0xA490, 0xA4C6},   {0xA960, 0xA97C},   {0xAC00, 0xD7A3},
+      {0xF900, 0xFAFF},   {0xFE10, 0xFE19},   {0xFE30, 0xFE52},
+      {0xFE54, 0xFE66},   {0xFE68, 0xFE6B},   {0xFF01, 0xFF60},
+      {0xFFE0, 0xFFE6},   {0x16FE0, 0x16FE4}, {0x16FF0, 0x16FF1},
+      {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
+      {0x1AFF0, 0x1AFF3}, {0x1AFF5, 0x1AFFB}, {0x1AFFD, 0x1AFFE},
+      {0x1B000, 0x1B122}, {0x1B132, 0x1B132}, {0x1B150, 0x1B152},
+      {0x1B155, 0x1B155}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB},
+      {0x1F004, 0x1F004}, {0x1F0CF, 0x1F0CF}, {0x1F18E, 0x1F18E},
+      {0x1F191, 0x1F19A}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23B},
+      {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265},
+      {0x1F300, 0x1F64F}, {0x1F680, 0x1F6C5}, {0x1F6CC, 0x1F6CC},
+      {0x1F6D0, 0x1F6D2}, {0x1F6D5, 0x1F6D7}, {0x1F6DC, 0x1F6DF},
+      {0x1F6EB, 0x1F6EC}, {0x1F6F4, 0x1F6FC}, {0x1F7E0, 0x1F7EB},
+      {0x1F7F0, 0x1F7F0}, {0x1F900, 0x1F9FF}, {0x1FA70, 0x1FA7C},
+      {0x1FA80, 0x1FA88}, {0x1FA90, 0x1FABD}, {0x1FABF, 0x1FAC5},
+      {0x1FACE, 0x1FADB}, {0x1FAE0, 0x1FAE8}, {0x1FAF0, 0x1FAF8},
+      {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}
   };
   static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges);
 
@@ -493,4 +511,3 @@
 } // namespace unicode
 } // namespace sys
 } // namespace llvm
-


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D138518.478212.patch
Type: text/x-patch
Size: 4751 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20221128/1ecfc226/attachment.bin>


More information about the llvm-commits mailing list