[cfe-commits] r148389 - in /cfe/trunk: include/clang/Basic/DiagnosticLexKinds.td lib/Lex/LiteralSupport.cpp
Eli Friedman
eli.friedman at gmail.com
Wed Jan 18 14:49:29 PST 2012
On Wed, Jan 18, 2012 at 4:27 AM, Seth Cantrell <seth.cantrell at gmail.com> wrote:
> Author: socantre
> Date: Wed Jan 18 06:27:04 2012
> New Revision: 148389
>
> URL: http://llvm.org/viewvc/llvm-project?rev=148389&view=rev
> Log:
> Improves support for Unicode in character literals
>
> Updates ProcessUCNEscape() for C++. C++11 allows UCNs in character
> and string literals that represent control characters and basic
> source characters. Also C++03 allows UCNs that refer to surrogate
> codepoints.
>
> UTF-8 sequences in character literals are now handled as single
> c-chars.
>
> Added error for multiple characters in Unicode character literals.
>
> Added errors for when the execution charset encoding of a c-char
> cannot be represented as a single code unit in the associated
> character type. Note that for the purposes of this error the asso-
> ciated character type for a narrow character literal is char, not
> int, even though in C narrow character literals have type int.
>
> Modified:
> cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> cfe/trunk/lib/Lex/LiteralSupport.cpp
>
> Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=148389&r1=148388&r2=148389&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
> +++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Wed Jan 18 06:27:04 2012
> @@ -107,6 +107,8 @@
> "extraneous characters in character constant ignored">;
> def warn_char_constant_too_large : Warning<
> "character constant too long for its type">;
> +def err_multichar_utf_character_literal : Error<
> + "Unicode character literals may not contain multiple characters">;
> def err_exponent_has_no_digits : Error<"exponent has no digits">;
> def ext_imaginary_constant : Extension<"imaginary constants are an extension">;
> def err_hexconstant_requires_exponent : Error<
> @@ -121,8 +123,8 @@
> def ext_string_too_long : Extension<"string literal of length %0 exceeds "
> "maximum length %1 that %select{C90|ISO C99|C++}2 compilers are required to "
> "support">, InGroup<OverlengthStrings>;
> -def warn_ucn_escape_too_large : ExtWarn<
> - "character unicode escape sequence too long for its type">, InGroup<Unicode>;
> +def err_character_too_large : Error<
> + "character too large for enclosing character literal type">;
> def warn_ucn_not_valid_in_c89 : ExtWarn<
> "unicode escape sequences are only valid in C99 or C++">, InGroup<Unicode>;
> def warn_cxx98_compat_unicode_literal : Warning<
> @@ -132,6 +134,8 @@
> "unsupported non-standard concatenation of string literals">;
> def err_bad_string_encoding : Error<
> "illegal sequence in string literal">;
> +def err_bad_character_encoding : Error<
> + "illegal sequence in character literal">;
>
> //===----------------------------------------------------------------------===//
> // PTH Diagnostics
>
> Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=148389&r1=148388&r2=148389&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
> +++ cfe/trunk/lib/Lex/LiteralSupport.cpp Wed Jan 18 06:27:04 2012
> @@ -182,7 +182,8 @@
> static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
> uint32_t &UcnVal, unsigned short &UcnLen,
> FullSourceLoc Loc, DiagnosticsEngine *Diags,
> - const LangOptions &Features) {
> + const LangOptions &Features,
> + bool in_char_string_literal = false) {
> if (!Features.CPlusPlus && !Features.C99 && Diags)
> Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
>
> @@ -216,11 +217,20 @@
> }
> return false;
> }
> - // Check UCN constraints (C99 6.4.3p2).
> - if ((UcnVal < 0xa0 &&
> - (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
> - || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
> - || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
> + // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
> + bool invalid_ucn = (0xD800<=UcnVal && UcnVal<=0xDFFF) // surrogate codepoints
> + || 0x10FFFF < UcnVal; // maximum legal UTF32 value
> +
> + // C++11 allows UCNs that refer to control characters and basic source
> + // characters inside character and string literals
> + if (!Features.CPlusPlus0x || !in_char_string_literal) {
> + if ((UcnVal < 0xa0 &&
> + (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 ))) { // $, @, `
> + invalid_ucn = true;
> + }
> + }
> +
> + if (invalid_ucn) {
> if (Diags)
> Diags->Report(Loc, diag::err_ucn_escape_invalid);
> return false;
> @@ -747,14 +757,13 @@
> CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
> SourceLocation Loc, Preprocessor &PP,
> tok::TokenKind kind) {
> - // At this point we know that the character matches the regex "L?'.*'".
> + // At this point we know that the character matches the regex "(L|u|U)?'.*'".
> HadError = false;
>
> Kind = kind;
>
> - // Determine if this is a wide or UTF character.
> - if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
> - Kind == tok::utf32_char_constant) {
> + // Skip over wide character determinant.
> + if (Kind != tok::char_constant) {
> ++begin;
> }
>
> @@ -762,6 +771,10 @@
> assert(begin[0] == '\'' && "Invalid token lexed");
> ++begin;
>
> + // Trim the ending quote.
> + assert(end[-1] == '\'' && "Invalid token lexed");
> + --end;
> +
> // FIXME: The "Value" is an uint64_t so we can handle char literals of
> // up to 64-bits.
> // FIXME: This extensively assumes that 'char' is 8-bits.
> @@ -773,76 +786,114 @@
> assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
> "Assumes sizeof(wchar) on target is <= 64");
>
> - // This is what we will use for overflow detection
> - llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
> -
> - unsigned NumCharsSoFar = 0;
> - bool Warned = false;
> - while (begin[0] != '\'') {
> - uint64_t ResultChar;
> -
> - // Is this a Universal Character Name escape?
> - if (begin[0] != '\\') // If this is a normal character, consume it.
> - ResultChar = (unsigned char)*begin++;
> - else { // Otherwise, this is an escape character.
> - unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
> - // Check for UCN.
> - if (begin[1] == 'u' || begin[1] == 'U') {
> - uint32_t utf32 = 0;
> - unsigned short UcnLen = 0;
> - if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
> - FullSourceLoc(Loc, PP.getSourceManager()),
> - &PP.getDiagnostics(), PP.getLangOptions())) {
> - HadError = 1;
> - }
> - ResultChar = utf32;
> - if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
> - PP.Diag(Loc, diag::warn_ucn_escape_too_large);
> - ResultChar &= ~0U >> (32-CharWidth);
> - }
> + SmallVector<uint32_t,4> codepoint_buffer;
> + codepoint_buffer.resize(end-begin);
> + uint32_t *buffer_begin = &codepoint_buffer.front();
> + uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
> +
> + // Unicode escapes representing characters that cannot be correctly
> + // represented in a single code unit are disallowed in character literals
> + // by this implementation.
> + uint32_t largest_character_for_kind;
> + if (tok::wide_char_constant == Kind) {
> + largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
> + } else if (tok::utf16_char_constant == Kind) {
> + largest_character_for_kind = 0xFFFF;
> + } else if (tok::utf32_char_constant == Kind) {
> + largest_character_for_kind = 0x10FFFF;
> + } else {
> + largest_character_for_kind = 0x7Fu;
> + }
> +
> + while (begin!=end) {
> + // Is this a span of non-escape characters?
> + if (begin[0] != '\\') {
> + char const *start = begin;
> + do {
> + ++begin;
> + } while (begin != end && *begin != '\\');
> +
> + uint32_t *tmp_begin = buffer_begin;
> + ConversionResult res =
> + ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
> + reinterpret_cast<UTF8 const *>(begin),
> + &buffer_begin,buffer_end,strictConversion);
> + if (res!=conversionOK) {
> + PP.Diag(Loc, diag::err_bad_character_encoding);
This error message can lead to rather uninformative complaints which
look like the following:
fribidi_char_sets_cp1256.c:214:9: error:
illegal sequence in character literal
return '?';
^
Any ideas for how we could improve this diagnostic?
-Eli
More information about the cfe-commits
mailing list