r320697 - Warn if we find a Unicode homoglyph for a symbol in an identifier.

Thu Dec 14 05:15:08 PST 2017

Author: rsmith
Date: Thu Dec 14 05:15:08 2017
New Revision: 320697

URL: http://llvm.org/viewvc/llvm-project?rev=320697&view=rev
Log:
Warn if we find a Unicode homoglyph for a symbol in an identifier.

Specifically, warn if:
 * we find a character that the language standard says we must treat as an
   identifier, and
 * that character is not reasonably an identifier character (it's a punctuation
   character or similar), and 
 * it renders identically to a valid non-identifier character in common
   fixed-width fonts.

Some tools "helpfully" substitute the surprising characters for the expected
characters, and replacing semicolons with Greek question marks is a common
"prank".

Modified:
    cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
    cfe/trunk/lib/Lex/Lexer.cpp
    cfe/trunk/test/Lexer/unicode.c

Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=320697&r1=320696&r2=320697&view=diff
==============================================================================

--- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
+++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Thu Dec 14 05:15:08 2017
@@ -119,6 +119,9 @@ def err_non_ascii : Error<
 def ext_unicode_whitespace : ExtWarn<
   "treating Unicode character as whitespace">,
   InGroup<DiagGroup<"unicode-whitespace">>;
+def warn_utf8_symbol_homoglyph : Warning<
+  "treating Unicode character <U+%0> as identifier character rather than "
+  "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
 
 def err_hex_escape_no_digits : Error<
   "\\%0 used with no following hex digits">;

Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=320697&r1=320696&r2=320697&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Thu Dec 14 05:15:08 2017
@@ -37,6 +37,7 @@
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/NativeFormatting.h"
 #include "llvm/Support/UnicodeCharRanges.h"
 #include <algorithm>
 #include <cassert>
@@ -1500,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(Di
   }
 }
 
+/// After encountering UTF-8 character C and interpreting it as an identifier
+/// character, check whether it's a homoglyph for a common non-identifier
+/// source character that is unlikely to be an intentional identifier
+/// character and warn if so.
+static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
+                                       CharSourceRange Range) {
+  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
+  struct HomoglyphPair {
+    uint32_t Character;
+    char LooksLike;
+    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
+  };
+  static constexpr HomoglyphPair SortedHomoglyphs[] = {
+    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
+    {U'\u037e', ';'}, // GREEK QUESTION MARK
+    {U'\u2212', '-'}, // MINUS SIGN
+    {U'\u2215', '/'}, // DIVISION SLASH
+    {U'\u2216', '\\'}, // SET MINUS
+    {U'\u2217', '*'}, // ASTERISK OPERATOR
+    {U'\u2223', '|'}, // DIVIDES
+    {U'\u2227', '^'}, // LOGICAL AND
+    {U'\u2236', ':'}, // RATIO
+    {U'\u223c', '~'}, // TILDE OPERATOR
+    {U'\ua789', ':'}, // MODIFIER LETTER COLON
+    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
+    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
+    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
+    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
+    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
+    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
+    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
+    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
+    {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
+    {U'\uff0c', ','}, // FULLWIDTH COMMA
+    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
+    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
+    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
+    {U'\uff1a', ':'}, // FULLWIDTH COLON
+    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
+    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
+    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
+    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
+    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
+    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
+    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
+    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
+    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
+    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
+    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
+    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
+    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
+    {U'\uff5e', '~'}, // FULLWIDTH TILDE
+    {0, 0}
+  };
+  auto Homoglyph =
+      std::lower_bound(std::begin(SortedHomoglyphs),
+                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
+  if (Homoglyph->Character == C) {
+    llvm::SmallString<5> CharBuf;
+    {
+      llvm::raw_svector_ostream CharOS(CharBuf);
+      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
+    }
+    const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
+    Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+        << Range << CharBuf << LooksLikeStr;
+  }
+}
+
 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                     Token &Result) {
   const char *UCNPtr = CurPtr + Size;
@@ -1534,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char
       !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
     return false;
 
-  if (!isLexingRawMode())
+  if (!isLexingRawMode()) {
     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr),
                               /*IsFirst=*/false);
+    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+                               makeCharRange(*this, CurPtr, UnicodePtr));
+  }
 
   CurPtr = UnicodePtr;
   return true;
@@ -3737,6 +3810,7 @@ LexNextToken:
     // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
     // an escaped newline.
     --CurPtr;
+    const char *UTF8StartPtr = CurPtr;
     llvm::ConversionResult Status =
         llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                   (const llvm::UTF8 *)BufferEnd,
@@ -3751,6 +3825,9 @@ LexNextToken:
         // (We manually eliminate the tail call to avoid recursion.)
         goto LexNextToken;
       }
+      if (!isLexingRawMode())
+        maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+                                   makeCharRange(*this, UTF8StartPtr, CurPtr));
       return LexUnicode(Result, CodePoint, CurPtr);
     }
 

Modified: cfe/trunk/test/Lexer/unicode.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/unicode.c?rev=320697&r1=320696&r2=320697&view=diff
==============================================================================
--- cfe/trunk/test/Lexer/unicode.c (original)
+++ cfe/trunk/test/Lexer/unicode.c Thu Dec 14 05:15:08 2017
@@ -33,3 +33,8 @@ int main () {
   int ð· = ðµ(ð¹);
   return ð·;
 }
+
+int nÍ¾ = 3; // expected-warning {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
+int *nêêv = &nÍ¾; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}}
+                 // expected-warning at -1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
+int vï¼ï¼»ï¼ï¼½ï¼autoï¼ï½returnï½xï¼ï½ï¼ï¼; // expected-warning 12{{treating Unicode character}}