r341700 - PR38870: Add warning for zero-width unicode characters appearing in identifiers.
Richard Smith via cfe-commits
cfe-commits at lists.llvm.org
Fri Sep 7 12:25:39 PDT 2018
Author: rsmith
Date: Fri Sep 7 12:25:39 2018
New Revision: 341700
URL: http://llvm.org/viewvc/llvm-project?rev=341700&view=rev
Log:
PR38870: Add warning for zero-width unicode characters appearing in
identifiers.
Modified:
cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
cfe/trunk/lib/Lex/Lexer.cpp
cfe/trunk/test/Lexer/unicode.c
Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=341700&r1=341699&r2=341700&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
+++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Fri Sep 7 12:25:39 2018
@@ -122,6 +122,9 @@ def ext_unicode_whitespace : ExtWarn<
def warn_utf8_symbol_homoglyph : Warning<
"treating Unicode character <U+%0> as identifier character rather than "
"as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
+def warn_utf8_symbol_zero_width : Warning<
+ "identifier contains Unicode character <U+%0> that is invisible in "
+ "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
def err_hex_escape_no_digits : Error<
"\\%0 used with no following hex digits">;
Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=341700&r1=341699&r2=341700&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Fri Sep 7 12:25:39 2018
@@ -1510,8 +1510,17 @@ static void maybeDiagnoseUTF8Homoglyph(D
bool operator<(HomoglyphPair R) const { return Character < R.Character; }
};
static constexpr HomoglyphPair SortedHomoglyphs[] = {
+ {U'\u00ad', 0}, // SOFT HYPHEN
{U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
{U'\u037e', ';'}, // GREEK QUESTION MARK
+ {U'\u200b', 0}, // ZERO WIDTH SPACE
+ {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
+ {U'\u200d', 0}, // ZERO WIDTH JOINER
+ {U'\u2060', 0}, // WORD JOINER
+ {U'\u2061', 0}, // FUNCTION APPLICATION
+ {U'\u2062', 0}, // INVISIBLE TIMES
+ {U'\u2063', 0}, // INVISIBLE SEPARATOR
+ {U'\u2064', 0}, // INVISIBLE PLUS
{U'\u2212', '-'}, // MINUS SIGN
{U'\u2215', '/'}, // DIVISION SLASH
{U'\u2216', '\\'}, // SET MINUS
@@ -1521,6 +1530,7 @@ static void maybeDiagnoseUTF8Homoglyph(D
{U'\u2236', ':'}, // RATIO
{U'\u223c', '~'}, // TILDE OPERATOR
{U'\ua789', ':'}, // MODIFIER LETTER COLON
+ {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
{U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
{U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
{U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
@@ -1560,9 +1570,14 @@ static void maybeDiagnoseUTF8Homoglyph(D
llvm::raw_svector_ostream CharOS(CharBuf);
llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
}
- const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
- Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
- << Range << CharBuf << LooksLikeStr;
+ if (Homoglyph->LooksLike) {
+ const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
+ Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+ << Range << CharBuf << LooksLikeStr;
+ } else {
+ Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
+ << Range << CharBuf;
+ }
}
}
Modified: cfe/trunk/test/Lexer/unicode.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/unicode.c?rev=341700&r1=341699&r2=341700&view=diff
==============================================================================
--- cfe/trunk/test/Lexer/unicode.c (original)
+++ cfe/trunk/test/Lexer/unicode.c Fri Sep 7 12:25:39 2018
@@ -38,3 +38,10 @@ int n; = 3; // expected-warning {{tre
int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}}
// expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
int v＝［＝］（auto）｛return～x；｝（）; // expected-warning 12{{treating Unicode character}}
+
+int â xxâ;
+// expected-warning@-1 {{identifier contains Unicode character <U+2060> that is invisible in some environments}}
+// expected-warning@-2 {{identifier contains Unicode character <U+FEFF> that is invisible in some environments}}
+// expected-warning@-3 {{identifier contains Unicode character <U+200D> that is invisible in some environments}}
+int foo​bar = 0; // expected-warning {{identifier contains Unicode character <U+200B> that is invisible in some environments}}
+int x = foobar; // expected-error {{undeclared identifier}}
More information about the cfe-commits
mailing list