r320697 - Warn if we find a Unicode homoglyph for a symbol in an identifier.
Aaron Ballman via cfe-commits
cfe-commits at lists.llvm.org
Thu Dec 14 05:20:56 PST 2017
On Thu, Dec 14, 2017 at 8:15 AM, Richard Smith via cfe-commits
<cfe-commits at lists.llvm.org> wrote:
> Author: rsmith
> Date: Thu Dec 14 05:15:08 2017
> New Revision: 320697
>
> URL: http://llvm.org/viewvc/llvm-project?rev=320697&view=rev
> Log:
> Warn if we find a Unicode homoglyph for a symbol in an identifier.
>
> Specifically, warn if:
> * we find a character that the language standard says we must treat as an
> identifier, and
> * that character is not reasonably an identifier character (it's a punctuation
> character or similar), and
> * it renders identically to a valid non-identifier character in common
> fixed-width fonts.
>
> Some tools "helpfully" substitute the surprising characters for the expected
> characters, and replacing semicolons with Greek question marks is a common
> "prank".
>
> Modified:
> cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> cfe/trunk/lib/Lex/Lexer.cpp
> cfe/trunk/test/Lexer/unicode.c
>
> Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=320697&r1=320696&r2=320697&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
> +++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Thu Dec 14 05:15:08 2017
> @@ -119,6 +119,9 @@ def err_non_ascii : Error<
> def ext_unicode_whitespace : ExtWarn<
> "treating Unicode character as whitespace">,
> InGroup<DiagGroup<"unicode-whitespace">>;
> +def warn_utf8_symbol_homoglyph : Warning<
> + "treating Unicode character <U+%0> as identifier character rather than "
> + "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
Can this wording be tweaked slightly to "as an identifier character"
or does that cause too much of an "a/an" problem with "as %1 symbol"?
~Aaron
>
> def err_hex_escape_no_digits : Error<
> "\\%0 used with no following hex digits">;
>
> Modified: cfe/trunk/lib/Lex/Lexer.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=320697&r1=320696&r2=320697&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/Lexer.cpp (original)
> +++ cfe/trunk/lib/Lex/Lexer.cpp Thu Dec 14 05:15:08 2017
> @@ -37,6 +37,7 @@
> #include "llvm/Support/ConvertUTF.h"
> #include "llvm/Support/MathExtras.h"
> #include "llvm/Support/MemoryBuffer.h"
> +#include "llvm/Support/NativeFormatting.h"
> #include "llvm/Support/UnicodeCharRanges.h"
> #include <algorithm>
> #include <cassert>
> @@ -1500,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(Di
> }
> }
>
> +/// After encountering UTF-8 character C and interpreting it as an identifier
> +/// character, check whether it's a homoglyph for a common non-identifier
> +/// source character that is unlikely to be an intentional identifier
> +/// character and warn if so.
> +static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
> + CharSourceRange Range) {
> + // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
> + struct HomoglyphPair {
> + uint32_t Character;
> + char LooksLike;
> + bool operator<(HomoglyphPair R) const { return Character < R.Character; }
> + };
> + static constexpr HomoglyphPair SortedHomoglyphs[] = {
> + {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
> + {U'\u037e', ';'}, // GREEK QUESTION MARK
> + {U'\u2212', '-'}, // MINUS SIGN
> + {U'\u2215', '/'}, // DIVISION SLASH
> + {U'\u2216', '\\'}, // SET MINUS
> + {U'\u2217', '*'}, // ASTERISK OPERATOR
> + {U'\u2223', '|'}, // DIVIDES
> + {U'\u2227', '^'}, // LOGICAL AND
> + {U'\u2236', ':'}, // RATIO
> + {U'\u223c', '~'}, // TILDE OPERATOR
> + {U'\ua789', ':'}, // MODIFIER LETTER COLON
> + {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
> + {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
> + {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
> + {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
> + {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
> + {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
> + {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
> + {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
> + {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
> + {U'\uff0c', ','}, // FULLWIDTH COMMA
> + {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
> + {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
> + {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
> + {U'\uff1a', ':'}, // FULLWIDTH COLON
> + {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
> + {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
> + {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
> + {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
> + {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
> + {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
> + {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
> + {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
> + {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
> + {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
> + {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
> + {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
> + {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
> + {U'\uff5e', '~'}, // FULLWIDTH TILDE
> + {0, 0}
> + };
> + auto Homoglyph =
> + std::lower_bound(std::begin(SortedHomoglyphs),
> + std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
> + if (Homoglyph->Character == C) {
> + llvm::SmallString<5> CharBuf;
> + {
> + llvm::raw_svector_ostream CharOS(CharBuf);
> + llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
> + }
> + const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
> + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
> + << Range << CharBuf << LooksLikeStr;
> + }
> +}
> +
> bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
> Token &Result) {
> const char *UCNPtr = CurPtr + Size;
> @@ -1534,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char
> !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
> return false;
>
> - if (!isLexingRawMode())
> + if (!isLexingRawMode()) {
> maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
> makeCharRange(*this, CurPtr, UnicodePtr),
> /*IsFirst=*/false);
> + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
> + makeCharRange(*this, CurPtr, UnicodePtr));
> + }
>
> CurPtr = UnicodePtr;
> return true;
> @@ -3737,6 +3810,7 @@ LexNextToken:
> // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
> // an escaped newline.
> --CurPtr;
> + const char *UTF8StartPtr = CurPtr;
> llvm::ConversionResult Status =
> llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
> (const llvm::UTF8 *)BufferEnd,
> @@ -3751,6 +3825,9 @@ LexNextToken:
> // (We manually eliminate the tail call to avoid recursion.)
> goto LexNextToken;
> }
> + if (!isLexingRawMode())
> + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
> + makeCharRange(*this, UTF8StartPtr, CurPtr));
> return LexUnicode(Result, CodePoint, CurPtr);
> }
>
>
> Modified: cfe/trunk/test/Lexer/unicode.c
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/unicode.c?rev=320697&r1=320696&r2=320697&view=diff
> ==============================================================================
> --- cfe/trunk/test/Lexer/unicode.c (original)
> +++ cfe/trunk/test/Lexer/unicode.c Thu Dec 14 05:15:08 2017
> @@ -33,3 +33,8 @@ int main () {
> int 🌷 = 🌵(🌹);
> return 🌷;
> }
> +
> +int n; = 3; // expected-warning {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
> +int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}}
> + // expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
> +int v＝［＝］（auto）｛return～x；｝（）; // expected-warning 12{{treating Unicode character}}
>
>
> _______________________________________________
> cfe-commits mailing list
> cfe-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
More information about the cfe-commits
mailing list