[cfe-commits] r148389 - in /cfe/trunk: include/clang/Basic/DiagnosticLexKinds.td lib/Lex/LiteralSupport.cpp

Wed Jan 18 14:49:29 PST 2012

On Wed, Jan 18, 2012 at 4:27 AM, Seth Cantrell <seth.cantrell at gmail.com> wrote:
> Author: socantre
> Date: Wed Jan 18 06:27:04 2012
> New Revision: 148389
>
> URL: http://llvm.org/viewvc/llvm-project?rev=148389&view=rev
> Log:
> Improves support for Unicode in character literals
>
> Updates ProcessUCNEscape() for C++. C++11 allows UCNs in character
> and string literals that represent control characters and basic
> source characters. Also C++03 allows UCNs that refer to surrogate
> codepoints.
>
> UTF-8 sequences in character literals are now handled as single
> c-chars.
>
> Added error for multiple characters in Unicode character literals.
>
> Added errors for when the execution charset encoding of a c-char
> cannot be represented as a single code unit in the associated
> character type. Note that for the purposes of this error the asso-
> ciated character type for a narrow character literal is char, not
> int, even though in C narrow character literals have type int.
>
> Modified:
>    cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
>    cfe/trunk/lib/Lex/LiteralSupport.cpp
>
> Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=148389&r1=148388&r2=148389&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
> +++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Wed Jan 18 06:27:04 2012
> @@ -107,6 +107,8 @@
>   "extraneous characters in character constant ignored">;
>  def warn_char_constant_too_large : Warning<
>   "character constant too long for its type">;
> +def err_multichar_utf_character_literal : Error<
> +  "Unicode character literals may not contain multiple characters">;
>  def err_exponent_has_no_digits : Error<"exponent has no digits">;
>  def ext_imaginary_constant : Extension<"imaginary constants are an extension">;
>  def err_hexconstant_requires_exponent : Error<
> @@ -121,8 +123,8 @@
>  def ext_string_too_long : Extension<"string literal of length %0 exceeds "
>   "maximum length %1 that %select{C90|ISO C99|C++}2 compilers are required to "
>   "support">, InGroup<OverlengthStrings>;
> -def warn_ucn_escape_too_large : ExtWarn<
> -  "character unicode escape sequence too long for its type">, InGroup<Unicode>;
> +def err_character_too_large : Error<
> +  "character too large for enclosing character literal type">;
>  def warn_ucn_not_valid_in_c89 : ExtWarn<
>   "unicode escape sequences are only valid in C99 or C++">, InGroup<Unicode>;
>  def warn_cxx98_compat_unicode_literal : Warning<
> @@ -132,6 +134,8 @@
>   "unsupported non-standard concatenation of string literals">;
>  def err_bad_string_encoding : Error<
>   "illegal sequence in string literal">;
> +def err_bad_character_encoding : Error<
> +  "illegal sequence in character literal">;
>
>  //===----------------------------------------------------------------------===//
>  // PTH Diagnostics
>
> Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=148389&r1=148388&r2=148389&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
> +++ cfe/trunk/lib/Lex/LiteralSupport.cpp Wed Jan 18 06:27:04 2012
> @@ -182,7 +182,8 @@
>  static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
>                              uint32_t &UcnVal, unsigned short &UcnLen,
>                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
> -                             const LangOptions &Features) {
> +                             const LangOptions &Features,
> +                             bool in_char_string_literal = false) {
>   if (!Features.CPlusPlus && !Features.C99 && Diags)
>     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
>
> @@ -216,11 +217,20 @@
>     }
>     return false;
>   }
> -  // Check UCN constraints (C99 6.4.3p2).
> -  if ((UcnVal < 0xa0 &&
> -      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
> -      || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
> -      || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
> +  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
> +  bool invalid_ucn = (0xD800<=UcnVal && UcnVal<=0xDFFF) // surrogate codepoints
> +                       || 0x10FFFF < UcnVal; // maximum legal UTF32 value
> +
> +  // C++11 allows UCNs that refer to control characters and basic source
> +  // characters inside character and string literals
> +  if (!Features.CPlusPlus0x || !in_char_string_literal) {
> +    if ((UcnVal < 0xa0 &&
> +         (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 ))) {  // $, @, `
> +      invalid_ucn = true;
> +    }
> +  }
> +
> +  if (invalid_ucn) {
>     if (Diags)
>       Diags->Report(Loc, diag::err_ucn_escape_invalid);
>     return false;
> @@ -747,14 +757,13 @@
>  CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
>                                      SourceLocation Loc, Preprocessor &PP,
>                                      tok::TokenKind kind) {
> -  // At this point we know that the character matches the regex "L?'.*'".
> +  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
>   HadError = false;
>
>   Kind = kind;
>
> -  // Determine if this is a wide or UTF character.
> -  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
> -      Kind == tok::utf32_char_constant) {
> +  // Skip over wide character determinant.
> +  if (Kind != tok::char_constant) {
>     ++begin;
>   }
>
> @@ -762,6 +771,10 @@
>   assert(begin[0] == '\'' && "Invalid token lexed");
>   ++begin;
>
> +  // Trim the ending quote.
> +  assert(end[-1] == '\'' && "Invalid token lexed");
> +  --end;
> +
>   // FIXME: The "Value" is an uint64_t so we can handle char literals of
>   // up to 64-bits.
>   // FIXME: This extensively assumes that 'char' is 8-bits.
> @@ -773,76 +786,114 @@
>   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
>          "Assumes sizeof(wchar) on target is <= 64");
>
> -  // This is what we will use for overflow detection
> -  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
> -
> -  unsigned NumCharsSoFar = 0;
> -  bool Warned = false;
> -  while (begin[0] != '\'') {
> -    uint64_t ResultChar;
> -
> -      // Is this a Universal Character Name escape?
> -    if (begin[0] != '\\')     // If this is a normal character, consume it.
> -      ResultChar = (unsigned char)*begin++;
> -    else {                    // Otherwise, this is an escape character.
> -      unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
> -      // Check for UCN.
> -      if (begin[1] == 'u' || begin[1] == 'U') {
> -        uint32_t utf32 = 0;
> -        unsigned short UcnLen = 0;
> -        if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
> -                              FullSourceLoc(Loc, PP.getSourceManager()),
> -                              &PP.getDiagnostics(), PP.getLangOptions())) {
> -          HadError = 1;
> -        }
> -        ResultChar = utf32;
> -        if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
> -          PP.Diag(Loc, diag::warn_ucn_escape_too_large);
> -          ResultChar &= ~0U >> (32-CharWidth);
> -        }
> +  SmallVector<uint32_t,4> codepoint_buffer;
> +  codepoint_buffer.resize(end-begin);
> +  uint32_t *buffer_begin = &codepoint_buffer.front();
> +  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
> +
> +  // Unicode escapes representing characters that cannot be correctly
> +  // represented in a single code unit are disallowed in character literals
> +  // by this implementation.
> +  uint32_t largest_character_for_kind;
> +  if (tok::wide_char_constant == Kind) {
> +    largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
> +  } else if (tok::utf16_char_constant == Kind) {
> +    largest_character_for_kind = 0xFFFF;
> +  } else if (tok::utf32_char_constant == Kind) {
> +    largest_character_for_kind = 0x10FFFF;
> +  } else {
> +    largest_character_for_kind = 0x7Fu;
> +  }
> +
> +  while (begin!=end) {
> +    // Is this a span of non-escape characters?
> +    if (begin[0] != '\\') {
> +      char const *start = begin;
> +      do {
> +        ++begin;
> +      } while (begin != end && *begin != '\\');
> +
> +      uint32_t *tmp_begin = buffer_begin;
> +      ConversionResult res =
> +      ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
> +                         reinterpret_cast<UTF8 const *>(begin),
> +                         &buffer_begin,buffer_end,strictConversion);
> +      if (res!=conversionOK) {
> +        PP.Diag(Loc, diag::err_bad_character_encoding);

This error message can lead to rather uninformative complaints which
look like the following:

fribidi_char_sets_cp1256.c:214:9: error:
      illegal sequence in character literal
 return '?';
        ^

Any ideas for how we could improve this diagnostic?

-Eli