r341700 - PR38870: Add warning for zero-width unicode characters appearing in identifiers.

Richard Smith via cfe-commits cfe-commits at lists.llvm.org
Fri Sep 7 12:25:39 PDT 2018


Author: rsmith
Date: Fri Sep  7 12:25:39 2018
New Revision: 341700

URL: http://llvm.org/viewvc/llvm-project?rev=341700&view=rev
Log:
PR38870: Add warning for zero-width unicode characters appearing in
identifiers.

Modified:
    cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
    cfe/trunk/lib/Lex/Lexer.cpp
    cfe/trunk/test/Lexer/unicode.c

Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=341700&r1=341699&r2=341700&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
+++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Fri Sep  7 12:25:39 2018
@@ -122,6 +122,9 @@ def ext_unicode_whitespace : ExtWarn<
 def warn_utf8_symbol_homoglyph : Warning<
   "treating Unicode character <U+%0> as identifier character rather than "
   "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
+def warn_utf8_symbol_zero_width : Warning<
+  "identifier contains Unicode character <U+%0> that is invisible in "
+  "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
 
 def err_hex_escape_no_digits : Error<
   "\\%0 used with no following hex digits">;

Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=341700&r1=341699&r2=341700&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Fri Sep  7 12:25:39 2018
@@ -1510,8 +1510,17 @@ static void maybeDiagnoseUTF8Homoglyph(D
     bool operator<(HomoglyphPair R) const { return Character < R.Character; }
   };
   static constexpr HomoglyphPair SortedHomoglyphs[] = {
+    {U'\u00ad', 0},   // SOFT HYPHEN
     {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
     {U'\u037e', ';'}, // GREEK QUESTION MARK
+    {U'\u200b', 0},   // ZERO WIDTH SPACE
+    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
+    {U'\u200d', 0},   // ZERO WIDTH JOINER
+    {U'\u2060', 0},   // WORD JOINER
+    {U'\u2061', 0},   // FUNCTION APPLICATION
+    {U'\u2062', 0},   // INVISIBLE TIMES
+    {U'\u2063', 0},   // INVISIBLE SEPARATOR
+    {U'\u2064', 0},   // INVISIBLE PLUS
     {U'\u2212', '-'}, // MINUS SIGN
     {U'\u2215', '/'}, // DIVISION SLASH
     {U'\u2216', '\\'}, // SET MINUS
@@ -1521,6 +1530,7 @@ static void maybeDiagnoseUTF8Homoglyph(D
     {U'\u2236', ':'}, // RATIO
     {U'\u223c', '~'}, // TILDE OPERATOR
     {U'\ua789', ':'}, // MODIFIER LETTER COLON
+    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
     {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
     {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
     {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
@@ -1560,9 +1570,14 @@ static void maybeDiagnoseUTF8Homoglyph(D
       llvm::raw_svector_ostream CharOS(CharBuf);
       llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
     }
-    const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
-    Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
-        << Range << CharBuf << LooksLikeStr;
+    if (Homoglyph->LooksLike) {
+      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
+      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+          << Range << CharBuf << LooksLikeStr;
+    } else {
+      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
+          << Range << CharBuf;
+    }
   }
 }
 

Modified: cfe/trunk/test/Lexer/unicode.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/unicode.c?rev=341700&r1=341699&r2=341700&view=diff
==============================================================================
--- cfe/trunk/test/Lexer/unicode.c (original)
+++ cfe/trunk/test/Lexer/unicode.c Fri Sep  7 12:25:39 2018
@@ -38,3 +38,10 @@ int n; = 3; // expected-warning {{tre
 int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}}
                  // expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
 int v=[=](auto){return~x;}(); // expected-warning 12{{treating Unicode character}}
+
+int ⁠xx‍;
+// expected-warning@-1 {{identifier contains Unicode character <U+2060> that is invisible in some environments}}
+// expected-warning@-2 {{identifier contains Unicode character <U+FEFF> that is invisible in some environments}}
+// expected-warning@-3 {{identifier contains Unicode character <U+200D> that is invisible in some environments}}
+int foo​bar = 0; // expected-warning {{identifier contains Unicode character <U+200B> that is invisible in some environments}}
+int x = foobar; // expected-error {{undeclared identifier}}




More information about the cfe-commits mailing list