[cfe-commits] r173369 - in /cfe/trunk: include/clang/Basic/ConvertUTF.h include/clang/Basic/DiagnosticLexKinds.td include/clang/Lex/Lexer.h include/clang/Lex/Token.h lib/Lex/Lexer.cpp lib/Lex/Preprocessor.cpp test/CXX/over/over.oper/over.literal/p8.cpp test/CodeGen/ucn-identifiers.c test/FixIt/fixit-unicode.c test/Lexer/utf8-invalid.c test/Preprocessor/ucn-pp-identifier.c test/Sema/ucn-identifiers.c
Dmitri Gribenko
gribozavr at gmail.com
Thu Jan 24 13:34:51 PST 2013
On Thu, Jan 24, 2013 at 10:50 PM, Jordan Rose <jordan_rose at apple.com> wrote:
> Author: jrose
> Date: Thu Jan 24 14:50:46 2013
> New Revision: 173369
>
> URL: http://llvm.org/viewvc/llvm-project?rev=173369&view=rev
> Log:
> Handle universal character names and Unicode characters outside of literals.
>
> This is a missing piece for C99 conformance.
>
> This patch handles UCNs by adding a '\\' case to LexTokenInternal and
> LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN.
> If the UCN is not syntactically well-formed, we fall back to the old
> treatment: a backslash followed by an identifier beginning with 'u' (or 'U').
>
> Because the spelling of an identifier with UCNs still has the UCN in it, we
> need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo.
>
> Of course, valid code that does *not* use UCNs will see only a very minimal
> performance hit (checks after each identifier for non-ASCII characters,
> checks when converting raw_identifiers to identifiers that they do not
> contain UCNs, and checks when getting the spelling of an identifier that it
> does not contain a UCN).
>
> This patch also adds basic support for actual UTF-8 in the source. This is
> treated almost exactly the same as UCNs except that we consider stray
> Unicode characters to be mistakes and offer a fixit to remove them.
>
> Added:
> cfe/trunk/test/CodeGen/ucn-identifiers.c
> cfe/trunk/test/Lexer/utf8-invalid.c
> cfe/trunk/test/Preprocessor/ucn-pp-identifier.c
> cfe/trunk/test/Sema/ucn-identifiers.c
> Modified:
> cfe/trunk/include/clang/Basic/ConvertUTF.h
> cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> cfe/trunk/include/clang/Lex/Lexer.h
> cfe/trunk/include/clang/Lex/Token.h
> cfe/trunk/lib/Lex/Lexer.cpp
> cfe/trunk/lib/Lex/Preprocessor.cpp
> cfe/trunk/test/CXX/over/over.oper/over.literal/p8.cpp
> cfe/trunk/test/FixIt/fixit-unicode.c
>
> Modified: cfe/trunk/include/clang/Basic/ConvertUTF.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/ConvertUTF.h?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/ConvertUTF.h (original)
> +++ cfe/trunk/include/clang/Basic/ConvertUTF.h Thu Jan 24 14:50:46 2013
> @@ -161,6 +161,16 @@
>
> unsigned getNumBytesForUTF8(UTF8 firstByte);
>
> +static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
> + const UTF8 *sourceEnd,
> + UTF32 *target,
> + ConversionFlags flags) {
> + unsigned size = getNumBytesForUTF8(**source);
> + if (size > sourceEnd - *source)
> + return sourceExhausted;
> + return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
> +}
> +
> #ifdef __cplusplus
> }
>
>
> Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
> +++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Thu Jan 24 14:50:46 2013
> @@ -93,15 +93,29 @@
> "multi-character character constant">, InGroup<MultiChar>;
> def ext_four_char_character_literal : Extension<
> "multi-character character constant">, InGroup<FourByteMultiChar>;
> -
>
> -// Literal
> -def ext_nonstandard_escape : Extension<
> - "use of non-standard escape character '\\%0'">;
> -def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
> -def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">;
> +
> +// Unicode and UCNs
> +def err_invalid_utf8 : Error<
> + "source file is not valid UTF-8">;
> +def err_non_ascii : Error<
> + "non-ASCII characters are not allowed outside of literals and identifiers">;
> +def ext_unicode_whitespace : ExtWarn<
> + "treating Unicode character as whitespace">,
> + InGroup<DiagGroup<"unicode-whitespace">>;
> +
> +def err_hex_escape_no_digits : Error<
> + "\\%0 used with no following hex digits">;
> +def warn_ucn_escape_no_digits : Warning<
> + "\\%0 used with no following hex digits; "
> + "treating as '\\' followed by identifier">, InGroup<Unicode>;
> +def err_ucn_escape_incomplete : Error<
> + "incomplete universal character name">;
> +def warn_ucn_escape_incomplete : Warning<
> + "incomplete universal character name; "
> + "treating as '\\' followed by identifier">, InGroup<Unicode>;
> def err_ucn_escape_invalid : Error<"invalid universal character">;
> -def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
> +
> def err_ucn_escape_basic_scs : Error<
> "character '%0' cannot be specified by a universal character name">;
> def err_ucn_control_character : Error<
> @@ -112,6 +126,12 @@
> def warn_cxx98_compat_literal_ucn_control_character : Warning<
> "universal character name referring to a control character "
> "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
> +
> +
> +// Literal
> +def ext_nonstandard_escape : Extension<
> + "use of non-standard escape character '\\%0'">;
> +def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
> def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
> def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
> def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
>
> Modified: cfe/trunk/include/clang/Lex/Lexer.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/Lexer.h (original)
> +++ cfe/trunk/include/clang/Lex/Lexer.h Thu Jan 24 14:50:46 2013
> @@ -437,6 +437,11 @@
> ///
> void LexTokenInternal(Token &Result);
>
> + /// Given that a token begins with the Unicode character \p C, figure out
> + /// what kind of token it is and dispatch to the appropriate lexing helper
> + /// function.
> + void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
> +
> /// FormTokenWithChars - When we lex a token, we have identified a span
> /// starting at BufferPtr, going to TokEnd that forms the token. This method
> /// takes that range and assigns it to the token as its location and size. In
> @@ -579,6 +584,21 @@
> void cutOffLexing() { BufferPtr = BufferEnd; }
>
> bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
> +
> +
> + /// Read a universal character name.
> + ///
> + /// \param CurPtr The position in the source buffer after the initial '\'.
> + /// If the UCN is syntactically well-formed (but not necessarily
> + /// valid), this parameter will be updated to point to the
> + /// character after the UCN.
> + /// \param SlashLoc The position in the source buffer of the '\'.
> + /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
> + /// and handle token formation in the caller.
> + ///
> + /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
> + /// invalid.
> + uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
> };
>
>
>
> Modified: cfe/trunk/include/clang/Lex/Token.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Token.h?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/Token.h (original)
> +++ cfe/trunk/include/clang/Lex/Token.h Thu Jan 24 14:50:46 2013
> @@ -74,9 +74,10 @@
> StartOfLine = 0x01, // At start of line or only after whitespace.
> LeadingSpace = 0x02, // Whitespace exists before this token.
> DisableExpand = 0x04, // This identifier may never be macro expanded.
> - NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
> + NeedsCleaning = 0x08, // Contained an escaped newline or trigraph.
> LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
> - HasUDSuffix = 0x20 // This string or character literal has a ud-suffix.
> + HasUDSuffix = 0x20, // This string or character literal has a ud-suffix.
> + HasUCN = 0x40 // This identifier contains a UCN.
> };
>
> tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
> @@ -257,6 +258,9 @@
> /// \brief Return true if this token is a string or character literal which
> /// has a ud-suffix.
> bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
> +
> + /// Returns true if this token contains a universal character name.
> + bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
> };
>
> /// \brief Information about the conditional stack (\#if directives)
>
> Modified: cfe/trunk/lib/Lex/Lexer.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=173369&r1=173368&r2=173369&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/Lexer.cpp (original)
> +++ cfe/trunk/lib/Lex/Lexer.cpp Thu Jan 24 14:50:46 2013
> @@ -25,11 +25,13 @@
> //===----------------------------------------------------------------------===//
>
> #include "clang/Lex/Lexer.h"
> +#include "clang/Basic/ConvertUTF.h"
> #include "clang/Basic/SourceManager.h"
> #include "clang/Lex/CodeCompletionHandler.h"
> #include "clang/Lex/LexDiagnostic.h"
> #include "clang/Lex/Preprocessor.h"
> #include "llvm/ADT/STLExtras.h"
> +#include "llvm/ADT/StringExtras.h"
> #include "llvm/ADT/StringSwitch.h"
> #include "llvm/Support/Compiler.h"
> #include "llvm/Support/MemoryBuffer.h"
> @@ -371,10 +373,12 @@
> // NOTE: this has to be checked *before* testing for an IdentifierInfo.
> if (Tok.is(tok::raw_identifier))
> TokStart = Tok.getRawIdentifierData();
> - else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
> - // Just return the string from the identifier table, which is very quick.
> - Buffer = II->getNameStart();
> - return II->getLength();
> + else if (!Tok.hasUCN()) {
> + if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
> + // Just return the string from the identifier table, which is very quick.
> + Buffer = II->getNameStart();
> + return II->getLength();
> + }
> }
>
> // NOTE: this can be checked even after testing for an IdentifierInfo.
> @@ -1376,7 +1380,6 @@
> /// 2. If this is an escaped newline (potentially with whitespace between
> /// the backslash and newline), implicitly skip the newline and return
> /// the char after it.
> -/// 3. If this is a UCN, return it. FIXME: C++ UCN's?
> ///
> /// This handles the slow/uncommon case of the getCharAndSize method. Here we
> /// know that we can accumulate into Size, and that we have already incremented
> @@ -1509,6 +1512,77 @@
> IsAtStartOfLine = StartOfLine;
> }
>
> +namespace {
> + struct UCNCharRange {
> + uint32_t Lower;
> + uint32_t Upper;
> + };
> +
> + // C11 D.1, C++11 [charname.allowed]
> + // FIXME: C99 and C++03 each have a different set of allowed UCNs.
> + const UCNCharRange UCNAllowedCharRanges[] = {
> + // 1
> + { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD },
> + { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
> + { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 },
> + { 0x00F8, 0x00FF },
> + // 2
> + { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF },
> + // 3
> + { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 },
> + { 0x2054, 0x2054 }, { 0x2060, 0x206F },
> + // 4
> + { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 },
> + { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF },
> + // 5
> + { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F },
> + // 6
> + { 0x3040, 0xD7FF },
> + // 7
> + { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 },
> + { 0xFE47, 0xFFFD },
> + // 8
> + { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },
> + { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
> + { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
> + { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
> + { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD }
> + };
> +}
> +
> +static bool isAllowedIDChar(uint32_t c) {
> + unsigned LowPoint = 0;
> + unsigned HighPoint = llvm::array_lengthof(UCNAllowedCharRanges);
> +
> + // Binary search the UCNAllowedCharRanges set.
> + while (HighPoint != LowPoint) {
> + unsigned MidPoint = (HighPoint + LowPoint) / 2;
> + if (c < UCNAllowedCharRanges[MidPoint].Lower)
> + HighPoint = MidPoint;
> + else if (c > UCNAllowedCharRanges[MidPoint].Upper)
> + LowPoint = MidPoint + 1;
> + else
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static bool isAllowedInitiallyIDChar(uint32_t c) {
> + // C11 D.2, C++11 [charname.disallowed]
> + // FIXME: C99 only forbids "digits", presumably as described in C99 Annex D.
> + // FIXME: C++03 does not forbid any initial characters.
> + return !(0x0300 <= c && c <= 0x036F) &&
> + !(0x1DC0 <= c && c <= 0x1DFF) &&
> + !(0x20D0 <= c && c <= 0x20FF) &&
> + !(0xFE20 <= c && c <= 0xFE2F);
> +}
> +
> +static inline bool isASCII(char C) {
> + return static_cast<signed char>(C) >= 0;
> +}
> +
> +
> void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
> // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
> unsigned Size;
> @@ -1520,11 +1594,11 @@
>
> // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
> // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
> - // FIXME: UCNs.
> //
> // TODO: Could merge these checks into a CharInfo flag to make the comparison
> // cheaper
> - if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) {
> + if (isASCII(C) && C != '\\' && C != '?' &&
> + (C != '$' || !LangOpts.DollarIdents)) {
> FinishIdentifier:
> const char *IdStart = BufferPtr;
> FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
> @@ -1561,8 +1635,38 @@
> CurPtr = ConsumeChar(CurPtr, Size, Result);
> C = getCharAndSize(CurPtr, Size);
> continue;
> - } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
> - // Found end of identifier.
> +
> + } else if (C == '\\') {
> + const char *UCNPtr = CurPtr + Size;
> + uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
> + if (CodePoint == 0 || !isAllowedIDChar(CodePoint))
> + goto FinishIdentifier;
> +
> + Result.setFlag(Token::HasUCN);
> + if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
> + (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
> + CurPtr = UCNPtr;
> + else
> + while (CurPtr != UCNPtr)
> + (void)getAndAdvanceChar(CurPtr, Result);
> +
> + C = getCharAndSize(CurPtr, Size);
> + continue;
> + } else if (!isASCII(C)) {
> + const char *UnicodePtr = CurPtr;
> + UTF32 CodePoint;
> + ConversionResult Result = convertUTF8Sequence((const UTF8 **)&UnicodePtr,
> + (const UTF8 *)BufferEnd,
> + &CodePoint,
> + strictConversion);
> + if (Result != conversionOK ||
> + !isAllowedIDChar(static_cast<uint32_t>(CodePoint)))
> + goto FinishIdentifier;
> +
> + CurPtr = UnicodePtr;
> + C = getCharAndSize(CurPtr, Size);
> + continue;
> + } else if (!isIdentifierBody(C)) {
> goto FinishIdentifier;
> }
>
> @@ -1570,7 +1674,7 @@
> CurPtr = ConsumeChar(CurPtr, Size, Result);
>
> C = getCharAndSize(CurPtr, Size);
> - while (isIdentifierBody(C)) { // FIXME: UCNs.
> + while (isIdentifierBody(C)) {
> CurPtr = ConsumeChar(CurPtr, Size, Result);
> C = getCharAndSize(CurPtr, Size);
> }
> @@ -2592,6 +2696,135 @@
> return false;
> }
>
> +uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
> + Token *Result) {
> + assert(LangOpts.CPlusPlus || LangOpts.C99);
> +
> + unsigned CharSize;
> + char Kind = getCharAndSize(StartPtr, CharSize);
> +
> + unsigned NumHexDigits;
> + if (Kind == 'u')
> + NumHexDigits = 4;
> + else if (Kind == 'U')
> + NumHexDigits = 8;
> + else
> + return 0;
> +
> + const char *CurPtr = StartPtr + CharSize;
> + const char *KindLoc = &CurPtr[-1];
> +
> + uint32_t CodePoint = 0;
> + for (unsigned i = 0; i < NumHexDigits; ++i) {
> + char C = getCharAndSize(CurPtr, CharSize);
> +
> + unsigned Value = llvm::hexDigitValue(C);
> + if (Value == -1U) {
> + if (Result && !isLexingRawMode()) {
> + if (i == 0) {
> + Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
> + << StringRef(KindLoc, 1);
> + } else {
> + // FIXME: if i == 4 and NumHexDigits == 8, suggest a fixit to \u.
> + Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
> + }
> + }
> +
> + return 0;
> + }
> +
> + CodePoint <<= 4;
> + CodePoint += Value;
> +
> + CurPtr += CharSize;
> + }
> +
> + if (Result) {
> + Result->setFlag(Token::HasUCN);
> + if (CurPtr - StartPtr == NumHexDigits + 2)
> + StartPtr = CurPtr;
> + else
> + while (StartPtr != CurPtr)
> + (void)getAndAdvanceChar(StartPtr, *Result);
> + } else {
> + StartPtr = CurPtr;
> + }
> +
> + // C99 6.4.3p2: A universal character name shall not specify a character whose
> + // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
> + // 0060 (`), nor one in the range D800 through DFFF inclusive.
> + // C++11 [lex.charset]p2: If the hexadecimal value for a
> + // universal-character-name corresponds to a surrogate code point (in the
> + // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
> + // if the hexadecimal value for a universal-character-name outside the
> + // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
> + // string literal corresponds to a control character (in either of the
> + // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
> + // basic source character set, the program is ill-formed.
> + if (CodePoint < 0xA0) {
> + if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
> + return CodePoint;
> +
> + // We don't use isLexingRawMode() here because we need to warn about bad
> + // UCNs even when skipping preprocessing tokens in a #if block.
> + if (Result && PP) {
> + if (CodePoint < 0x20 || CodePoint >= 0x7F)
> + Diag(BufferPtr, diag::err_ucn_control_character);
> + else {
> + char C = static_cast<char>(CodePoint);
> + Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
> + }
> + }
> +
> + return 0;
> +
> + } else if ((!LangOpts.CPlusPlus || LangOpts.CPlusPlus11) &&
> + (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)) {
> + // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
> + // We don't use isLexingRawMode() here because we need to warn about bad
> + // UCNs even when skipping preprocessing tokens in a #if block.
> + if (Result && PP)
> + Diag(BufferPtr, diag::err_ucn_escape_invalid);
> + return 0;
> + }
> +
> + return CodePoint;
> +}
> +
> +void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
> + if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) {
> + MIOpt.ReadToken();
> + return LexIdentifier(Result, CurPtr);
> + }
> +
> + if (!isASCII(*BufferPtr) && !isAllowedIDChar(C)) {
> + // Non-ASCII characters tend to creep into source code unintentionally.
> + // Instead of letting the parser complain about the unknown token,
> + // just drop the character.
> + // Note that we can /only/ do this when the non-ASCII character is actually
> + // spelled as Unicode, not written as a UCN. The standard requires that
> + // we not throw away any possible preprocessor tokens, but there's a
> + // loophole in the mapping of Unicode characters to basic character set
> + // characters that allows us to map these particular characters to, say,
> + // whitespace.
> + if (!isLexingRawMode()) {
> + CharSourceRange CharRange =
> + CharSourceRange::getCharRange(getSourceLocation(),
> + getSourceLocation(CurPtr));
> + Diag(BufferPtr, diag::err_non_ascii)
> + << FixItHint::CreateRemoval(CharRange);
> + }
> +
> + BufferPtr = CurPtr;
> + return LexTokenInternal(Result);
> + }
> +
> + // Otherwise, we have an explicit UCN or a character that's unlikely to show
> + // up by accident.
> + MIOpt.ReadToken();
> + FormTokenWithChars(Result, CurPtr, tok::unknown);
> +}
> +
>
> /// LexTokenInternal - This implements a simple C family lexer. It is an
> /// extremely performance critical piece of code. This assumes that the buffer
> @@ -3243,12 +3476,41 @@
> Kind = tok::unknown;
> break;
>
> + // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
> case '\\':
> - // FIXME: UCN's.
> - // FALL THROUGH.
> - default:
> + if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
> + return LexUnicode(Result, CodePoint, CurPtr);
> +
> Kind = tok::unknown;
> break;
> +
> + default: {
> + if (isASCII(Char)) {
> + Kind = tok::unknown;
> + break;
> + }
> +
> + UTF32 CodePoint;
> +
> + // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
> + // an escaped newline.
> + --CurPtr;
> + ConversionResult Status = convertUTF8Sequence((const UTF8 **)&CurPtr,
> + (const UTF8 *)BufferEnd,
> + &CodePoint,
> + strictConversion);
> + if (Status == conversionOK)
> + return LexUnicode(Result, CodePoint, CurPtr);
> +
> + // Non-ASCII characters tend to creep into source code unintentionally.
> + // Instead of letting the parser complain about the unknown token,
> + // just warn that we don't have valid UTF-8, then drop the character.
The comment says 'just warn', but we actually emit an error diagnostic (err_invalid_utf8) here:
> + if (!isLexingRawMode())
> + Diag(CurPtr, diag::err_invalid_utf8);
Dmitri
--
main(i,j){for(i=2;;i++){for(j=2;j<i;j++){if(!(i%j)){j=0;break;}}if
(j){printf("%d\n",i);}}} /*Dmitri Gribenko <gribozavr at gmail.com>*/
More information about the cfe-commits
mailing list