[cfe-commits] r173369 - in /cfe/trunk: include/clang/Basic/ConvertUTF.h include/clang/Basic/DiagnosticLexKinds.td include/clang/Lex/Lexer.h include/clang/Lex/Token.h lib/Lex/Lexer.cpp lib/Lex/Preprocessor.cpp test/CXX/over/over.oper/over.literal/p8.cpp test/CodeGen/ucn-identifiers.c test/FixIt/fixit-unicode.c test/Lexer/utf8-invalid.c test/Preprocessor/ucn-pp-identifier.c test/Sema/ucn-identifiers.c

Thu Jan 24 13:34:51 PST 2013

On Thu, Jan 24, 2013 at 10:50 PM, Jordan Rose <jordan_rose at apple.com> wrote:
> Author: jrose
> Date: Thu Jan 24 14:50:46 2013
> New Revision: 173369
>
> URL: http://llvm.org/viewvc/llvm-project?rev=173369&view=rev
> Log:
> Handle universal character names and Unicode characters outside of literals.
>
> This is a missing piece for C99 conformance.
>
> This patch handles UCNs by adding a '\\' case to LexTokenInternal and
> LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN.
> If the UCN is not syntactically well-formed, we fall back to the old
> treatment: a backslash followed by an identifier beginning with 'u' (or 'U').
>
> Because the spelling of an identifier with UCNs still has the UCN in it, we
> need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo.
>
> Of course, valid code that does *not* use UCNs will see only a very minimal
> performance hit (checks after each identifier for non-ASCII characters,
> checks when converting raw_identifiers to identifiers that they do not
> contain UCNs, and checks when getting the spelling of an identifier that it
> does not contain a UCN).
>
> This patch also adds basic support for actual UTF-8 in the source. This is
> treated almost exactly the same as UCNs except that we consider stray
> Unicode characters to be mistakes and offer a fixit to remove them.
>
> Added:
>     cfe/trunk/test/CodeGen/ucn-identifiers.c
>     cfe/trunk/test/Lexer/utf8-invalid.c
>     cfe/trunk/test/Preprocessor/ucn-pp-identifier.c
>     cfe/trunk/test/Sema/ucn-identifiers.c
> Modified:
>     cfe/trunk/include/clang/Basic/ConvertUTF.h
>     cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
>     cfe/trunk/include/clang/Lex/Lexer.h
>     cfe/trunk/include/clang/Lex/Token.h
>     cfe/trunk/lib/Lex/Lexer.cpp
>     cfe/trunk/lib/Lex/Preprocessor.cpp
>     cfe/trunk/test/CXX/over/over.oper/over.literal/p8.cpp
>     cfe/trunk/test/FixIt/fixit-unicode.c
>
> Modified: cfe/trunk/include/clang/Basic/ConvertUTF.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/ConvertUTF.h?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/ConvertUTF.h (original)
> +++ cfe/trunk/include/clang/Basic/ConvertUTF.h Thu Jan 24 14:50:46 2013
> @@ -161,6 +161,16 @@
>
>  unsigned getNumBytesForUTF8(UTF8 firstByte);
>
> +static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
> +                                                   const UTF8 *sourceEnd,
> +                                                   UTF32 *target,
> +                                                   ConversionFlags flags) {
> +  unsigned size = getNumBytesForUTF8(**source);
> +  if (size > sourceEnd - *source)
> +    return sourceExhausted;
> +  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
> +}
> +
>  #ifdef __cplusplus
>  }
>
>
> Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
> +++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Thu Jan 24 14:50:46 2013
> @@ -93,15 +93,29 @@
>    "multi-character character constant">, InGroup<MultiChar>;
>  def ext_four_char_character_literal : Extension<
>    "multi-character character constant">, InGroup<FourByteMultiChar>;
> -
>
> -// Literal
> -def ext_nonstandard_escape : Extension<
> -  "use of non-standard escape character '\\%0'">;
> -def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
> -def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">;
> +
> +// Unicode and UCNs
> +def err_invalid_utf8 : Error<
> +  "source file is not valid UTF-8">;
> +def err_non_ascii : Error<
> +  "non-ASCII characters are not allowed outside of literals and identifiers">;
> +def ext_unicode_whitespace : ExtWarn<
> +  "treating Unicode character as whitespace">,
> +  InGroup<DiagGroup<"unicode-whitespace">>;
> +
> +def err_hex_escape_no_digits : Error<
> +  "\\%0 used with no following hex digits">;
> +def warn_ucn_escape_no_digits : Warning<
> +  "\\%0 used with no following hex digits; "
> +  "treating as '\\' followed by identifier">, InGroup<Unicode>;
> +def err_ucn_escape_incomplete : Error<
> +  "incomplete universal character name">;
> +def warn_ucn_escape_incomplete : Warning<
> +  "incomplete universal character name; "
> +  "treating as '\\' followed by identifier">, InGroup<Unicode>;
>  def err_ucn_escape_invalid : Error<"invalid universal character">;
> -def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
> +
>  def err_ucn_escape_basic_scs : Error<
>    "character '%0' cannot be specified by a universal character name">;
>  def err_ucn_control_character : Error<
> @@ -112,6 +126,12 @@
>  def warn_cxx98_compat_literal_ucn_control_character : Warning<
>    "universal character name referring to a control character "
>    "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
> +
> +
> +// Literal
> +def ext_nonstandard_escape : Extension<
> +  "use of non-standard escape character '\\%0'">;
> +def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
>  def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
>  def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
>  def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
>
> Modified: cfe/trunk/include/clang/Lex/Lexer.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/Lexer.h (original)
> +++ cfe/trunk/include/clang/Lex/Lexer.h Thu Jan 24 14:50:46 2013
> @@ -437,6 +437,11 @@
>    ///
>    void LexTokenInternal(Token &Result);
>
> +  /// Given that a token begins with the Unicode character \p C, figure out
> +  /// what kind of token it is and dispatch to the appropriate lexing helper
> +  /// function.
> +  void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
> +
>    /// FormTokenWithChars - When we lex a token, we have identified a span
>    /// starting at BufferPtr, going to TokEnd that forms the token.  This method
>    /// takes that range and assigns it to the token as its location and size.  In
> @@ -579,6 +584,21 @@
>    void cutOffLexing() { BufferPtr = BufferEnd; }
>
>    bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
> +
> +
> +  /// Read a universal character name.
> +  ///
> +  /// \param CurPtr The position in the source buffer after the initial '\'.
> +  ///               If the UCN is syntactically well-formed (but not necessarily
> +  ///               valid), this parameter will be updated to point to the
> +  ///               character after the UCN.
> +  /// \param SlashLoc The position in the source buffer of the '\'.
> +  /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
> +  ///            and handle token formation in the caller.
> +  ///
> +  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
> +  ///         invalid.
> +  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
>  };
>
>
>
> Modified: cfe/trunk/include/clang/Lex/Token.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Token.h?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/Token.h (original)
> +++ cfe/trunk/include/clang/Lex/Token.h Thu Jan 24 14:50:46 2013
> @@ -74,9 +74,10 @@
>      StartOfLine   = 0x01,  // At start of line or only after whitespace.
>      LeadingSpace  = 0x02,  // Whitespace exists before this token.
>      DisableExpand = 0x04,  // This identifier may never be macro expanded.
> -    NeedsCleaning = 0x08,   // Contained an escaped newline or trigraph.
> +    NeedsCleaning = 0x08,  // Contained an escaped newline or trigraph.
>      LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
> -    HasUDSuffix = 0x20     // This string or character literal has a ud-suffix.
> +    HasUDSuffix = 0x20,    // This string or character literal has a ud-suffix.
> +    HasUCN = 0x40          // This identifier contains a UCN.
>    };
>
>    tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
> @@ -257,6 +258,9 @@
>    /// \brief Return true if this token is a string or character literal which
>    /// has a ud-suffix.
>    bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
> +
> +  /// Returns true if this token contains a universal character name.
> +  bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
>  };
>
>  /// \brief Information about the conditional stack (\#if directives)
>
> Modified: cfe/trunk/lib/Lex/Lexer.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/Lexer.cpp (original)
> +++ cfe/trunk/lib/Lex/Lexer.cpp Thu Jan 24 14:50:46 2013
> @@ -25,11 +25,13 @@
>  //===----------------------------------------------------------------------===//
>
>  #include "clang/Lex/Lexer.h"
> +#include "clang/Basic/ConvertUTF.h"
>  #include "clang/Basic/SourceManager.h"
>  #include "clang/Lex/CodeCompletionHandler.h"
>  #include "clang/Lex/LexDiagnostic.h"
>  #include "clang/Lex/Preprocessor.h"
>  #include "llvm/ADT/STLExtras.h"
> +#include "llvm/ADT/StringExtras.h"
>  #include "llvm/ADT/StringSwitch.h"
>  #include "llvm/Support/Compiler.h"
>  #include "llvm/Support/MemoryBuffer.h"
> @@ -371,10 +373,12 @@
>    // NOTE: this has to be checked *before* testing for an IdentifierInfo.
>    if (Tok.is(tok::raw_identifier))
>      TokStart = Tok.getRawIdentifierData();
> -  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
> -    // Just return the string from the identifier table, which is very quick.
> -    Buffer = II->getNameStart();
> -    return II->getLength();
> +  else if (!Tok.hasUCN()) {
> +    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
> +      // Just return the string from the identifier table, which is very quick.
> +      Buffer = II->getNameStart();
> +      return II->getLength();
> +    }
>    }
>
>    // NOTE: this can be checked even after testing for an IdentifierInfo.
> @@ -1376,7 +1380,6 @@
>  ///   2. If this is an escaped newline (potentially with whitespace between
>  ///      the backslash and newline), implicitly skip the newline and return
>  ///      the char after it.
> -///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
>  ///
>  /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
>  /// know that we can accumulate into Size, and that we have already incremented
> @@ -1509,6 +1512,77 @@
>    IsAtStartOfLine = StartOfLine;
>  }
>
> +namespace {
> +  struct UCNCharRange {
> +    uint32_t Lower;
> +    uint32_t Upper;
> +  };
> +
> +  // C11 D.1, C++11 [charname.allowed]
> +  // FIXME: C99 and C++03 each have a different set of allowed UCNs.
> +  const UCNCharRange UCNAllowedCharRanges[] = {
> +    // 1
> +    { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD },
> +    { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
> +    { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 },
> +    { 0x00F8, 0x00FF },
> +    // 2
> +    { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF },
> +    // 3
> +    { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 },
> +    { 0x2054, 0x2054 }, { 0x2060, 0x206F },
> +    // 4
> +    { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 },
> +    { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF },
> +    // 5
> +    { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F },
> +    // 6
> +    { 0x3040, 0xD7FF },
> +    // 7
> +    { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 },
> +    { 0xFE47, 0xFFFD },
> +    // 8
> +    { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },
> +    { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
> +    { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
> +    { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
> +    { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD }
> +  };
> +}
> +
> +static bool isAllowedIDChar(uint32_t c) {
> +  unsigned LowPoint = 0;
> +  unsigned HighPoint = llvm::array_lengthof(UCNAllowedCharRanges);
> +
> +  // Binary search the UCNAllowedCharRanges set.
> +  while (HighPoint != LowPoint) {
> +    unsigned MidPoint = (HighPoint + LowPoint) / 2;
> +    if (c < UCNAllowedCharRanges[MidPoint].Lower)
> +      HighPoint = MidPoint;
> +    else if (c > UCNAllowedCharRanges[MidPoint].Upper)
> +      LowPoint = MidPoint + 1;
> +    else
> +      return true;
> +  }
> +
> +  return false;
> +}
> +
> +static bool isAllowedInitiallyIDChar(uint32_t c) {
> +  // C11 D.2, C++11 [charname.disallowed]
> +  // FIXME: C99 only forbids "digits", presumably as described in C99 Annex D.
> +  // FIXME: C++03 does not forbid any initial characters.
> +  return !(0x0300 <= c && c <= 0x036F) &&
> +         !(0x1DC0 <= c && c <= 0x1DFF) &&
> +         !(0x20D0 <= c && c <= 0x20FF) &&
> +         !(0xFE20 <= c && c <= 0xFE2F);
> +}
> +
> +static inline bool isASCII(char C) {
> +  return static_cast<signed char>(C) >= 0;
> +}
> +
> +
>  void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
>    // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
>    unsigned Size;
> @@ -1520,11 +1594,11 @@
>
>    // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
>    // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
> -  // FIXME: UCNs.
>    //
>    // TODO: Could merge these checks into a CharInfo flag to make the comparison
>    // cheaper
> -  if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) {
> +  if (isASCII(C) && C != '\\' && C != '?' &&
> +      (C != '$' || !LangOpts.DollarIdents)) {
>  FinishIdentifier:
>      const char *IdStart = BufferPtr;
>      FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
> @@ -1561,8 +1635,38 @@
>        CurPtr = ConsumeChar(CurPtr, Size, Result);
>        C = getCharAndSize(CurPtr, Size);
>        continue;
> -    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
> -      // Found end of identifier.
> +
> +    } else if (C == '\\') {
> +      const char *UCNPtr = CurPtr + Size;
> +      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
> +      if (CodePoint == 0 || !isAllowedIDChar(CodePoint))
> +        goto FinishIdentifier;
> +
> +      Result.setFlag(Token::HasUCN);
> +      if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
> +          (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
> +        CurPtr = UCNPtr;
> +      else
> +        while (CurPtr != UCNPtr)
> +          (void)getAndAdvanceChar(CurPtr, Result);
> +
> +      C = getCharAndSize(CurPtr, Size);
> +      continue;
> +    } else if (!isASCII(C)) {
> +      const char *UnicodePtr = CurPtr;
> +      UTF32 CodePoint;
> +      ConversionResult Result = convertUTF8Sequence((const UTF8 **)&UnicodePtr,
> +                                                    (const UTF8 *)BufferEnd,
> +                                                    &CodePoint,
> +                                                    strictConversion);
> +      if (Result != conversionOK ||
> +          !isAllowedIDChar(static_cast<uint32_t>(CodePoint)))
> +        goto FinishIdentifier;
> +
> +      CurPtr = UnicodePtr;
> +      C = getCharAndSize(CurPtr, Size);
> +      continue;
> +    } else if (!isIdentifierBody(C)) {
>        goto FinishIdentifier;
>      }
>
> @@ -1570,7 +1674,7 @@
>      CurPtr = ConsumeChar(CurPtr, Size, Result);
>
>      C = getCharAndSize(CurPtr, Size);
> -    while (isIdentifierBody(C)) { // FIXME: UCNs.
> +    while (isIdentifierBody(C)) {
>        CurPtr = ConsumeChar(CurPtr, Size, Result);
>        C = getCharAndSize(CurPtr, Size);
>      }
> @@ -2592,6 +2696,135 @@
>    return false;
>  }
>
> +uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
> +                           Token *Result) {
> +  assert(LangOpts.CPlusPlus || LangOpts.C99);
> +
> +  unsigned CharSize;
> +  char Kind = getCharAndSize(StartPtr, CharSize);
> +
> +  unsigned NumHexDigits;
> +  if (Kind == 'u')
> +    NumHexDigits = 4;
> +  else if (Kind == 'U')
> +    NumHexDigits = 8;
> +  else
> +    return 0;
> +
> +  const char *CurPtr = StartPtr + CharSize;
> +  const char *KindLoc = &CurPtr[-1];
> +
> +  uint32_t CodePoint = 0;
> +  for (unsigned i = 0; i < NumHexDigits; ++i) {
> +    char C = getCharAndSize(CurPtr, CharSize);
> +
> +    unsigned Value = llvm::hexDigitValue(C);
> +    if (Value == -1U) {
> +      if (Result && !isLexingRawMode()) {
> +        if (i == 0) {
> +          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
> +            << StringRef(KindLoc, 1);
> +        } else {
> +          // FIXME: if i == 4 and NumHexDigits == 8, suggest a fixit to \u.
> +          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
> +        }
> +      }
> +
> +      return 0;
> +    }
> +
> +    CodePoint <<= 4;
> +    CodePoint += Value;
> +
> +    CurPtr += CharSize;
> +  }
> +
> +  if (Result) {
> +    Result->setFlag(Token::HasUCN);
> +    if (CurPtr - StartPtr == NumHexDigits + 2)
> +      StartPtr = CurPtr;
> +    else
> +      while (StartPtr != CurPtr)
> +        (void)getAndAdvanceChar(StartPtr, *Result);
> +  } else {
> +    StartPtr = CurPtr;
> +  }
> +
> +  // C99 6.4.3p2: A universal character name shall not specify a character whose
> +  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
> +  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
> +  // C++11 [lex.charset]p2: If the hexadecimal value for a
> +  //   universal-character-name corresponds to a surrogate code point (in the
> +  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
> +  //   if the hexadecimal value for a universal-character-name outside the
> +  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
> +  //   string literal corresponds to a control character (in either of the
> +  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
> +  //   basic source character set, the program is ill-formed.
> +  if (CodePoint < 0xA0) {
> +    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
> +      return CodePoint;
> +
> +    // We don't use isLexingRawMode() here because we need to warn about bad
> +    // UCNs even when skipping preprocessing tokens in a #if block.
> +    if (Result && PP) {
> +      if (CodePoint < 0x20 || CodePoint >= 0x7F)
> +        Diag(BufferPtr, diag::err_ucn_control_character);
> +      else {
> +        char C = static_cast<char>(CodePoint);
> +        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
> +      }
> +    }
> +
> +    return 0;
> +
> +  } else if ((!LangOpts.CPlusPlus || LangOpts.CPlusPlus11) &&
> +             (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)) {
> +    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
> +    // We don't use isLexingRawMode() here because we need to warn about bad
> +    // UCNs even when skipping preprocessing tokens in a #if block.
> +    if (Result && PP)
> +      Diag(BufferPtr, diag::err_ucn_escape_invalid);
> +    return 0;
> +  }
> +
> +  return CodePoint;
> +}
> +
> +void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
> +  if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) {
> +    MIOpt.ReadToken();
> +    return LexIdentifier(Result, CurPtr);
> +  }
> +
> +  if (!isASCII(*BufferPtr) && !isAllowedIDChar(C)) {
> +    // Non-ASCII characters tend to creep into source code unintentionally.
> +    // Instead of letting the parser complain about the unknown token,
> +    // just drop the character.
> +    // Note that we can /only/ do this when the non-ASCII character is actually
> +    // spelled as Unicode, not written as a UCN. The standard requires that
> +    // we not throw away any possible preprocessor tokens, but there's a
> +    // loophole in the mapping of Unicode characters to basic character set
> +    // characters that allows us to map these particular characters to, say,
> +    // whitespace.
> +    if (!isLexingRawMode()) {
> +      CharSourceRange CharRange =
> +        CharSourceRange::getCharRange(getSourceLocation(),
> +                                      getSourceLocation(CurPtr));
> +      Diag(BufferPtr, diag::err_non_ascii)
> +        << FixItHint::CreateRemoval(CharRange);
> +    }
> +
> +    BufferPtr = CurPtr;
> +    return LexTokenInternal(Result);
> +  }
> +
> +  // Otherwise, we have an explicit UCN or a character that's unlikely to show
> +  // up by accident.
> +  MIOpt.ReadToken();
> +  FormTokenWithChars(Result, CurPtr, tok::unknown);
> +}
> +
>
>  /// LexTokenInternal - This implements a simple C family lexer.  It is an
>  /// extremely performance critical piece of code.  This assumes that the buffer
> @@ -3243,12 +3476,41 @@
>        Kind = tok::unknown;
>      break;
>
> +  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
>    case '\\':
> -    // FIXME: UCN's.
> -    // FALL THROUGH.
> -  default:
> +    if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
> +      return LexUnicode(Result, CodePoint, CurPtr);
> +
>      Kind = tok::unknown;
>      break;
> +
> +  default: {
> +    if (isASCII(Char)) {
> +      Kind = tok::unknown;
> +      break;
> +    }
> +
> +    UTF32 CodePoint;
> +
> +    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
> +    // an escaped newline.
> +    --CurPtr;
> +    ConversionResult Status = convertUTF8Sequence((const UTF8 **)&CurPtr,
> +                                                  (const UTF8 *)BufferEnd,
> +                                                  &CodePoint,
> +                                                  strictConversion);
> +    if (Status == conversionOK)
> +      return LexUnicode(Result, CodePoint, CurPtr);
> +
> +    // Non-ASCII characters tend to creep into source code unintentionally.
> +    // Instead of letting the parser complain about the unknown token,
> +    // just warn that we don't have valid UTF-8, then drop the character.

The comment says 'just warn', but the code below emits an error diagnostic (err_invalid_utf8), not a warning:

> +    if (!isLexingRawMode())
> +      Diag(CurPtr, diag::err_invalid_utf8);

Dmitri

-- 
main(i,j){for(i=2;;i++){for(j=2;j<i;j++){if(!(i%j)){j=0;break;}}if
(j){printf("%d\n",i);}}} /*Dmitri Gribenko <gribozavr at gmail.com>*/