r221576 - [c++1z] Support for u8 character literals.

Fri Nov 7 22:08:42 PST 2014

Author: rsmith
Date: Sat Nov  8 00:08:42 2014
New Revision: 221576

URL: http://llvm.org/viewvc/llvm-project?rev=221576&view=rev
Log:
[c++1z] Support for u8 character literals.

Modified:
    cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
    cfe/trunk/include/clang/Basic/TokenKinds.def
    cfe/trunk/include/clang/Basic/TokenKinds.h
    cfe/trunk/lib/Lex/Lexer.cpp
    cfe/trunk/lib/Lex/LiteralSupport.cpp
    cfe/trunk/lib/Lex/MacroArgs.cpp
    cfe/trunk/lib/Lex/PPExpressions.cpp
    cfe/trunk/lib/Lex/TokenConcatenation.cpp
    cfe/trunk/lib/Parse/ParseExpr.cpp
    cfe/trunk/lib/Parse/ParseTentative.cpp
    cfe/trunk/test/Lexer/utf8-char-literal.cpp
    cfe/trunk/www/cxx_status.html

Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================

--- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
+++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Sat Nov  8 00:08:42 2014
@@ -201,6 +201,9 @@ def warn_c99_compat_unicode_literal : Wa
 def warn_cxx98_compat_unicode_literal : Warning<
   "unicode literals are incompatible with C++98">,
   InGroup<CXX98Compat>, DefaultIgnore;
+def warn_cxx14_compat_u8_character_literal : Warning<
+  "unicode literals are incompatible with C++ standards before C++1z">,
+  InGroup<CXXPre1zCompat>, DefaultIgnore;
 def warn_cxx11_compat_user_defined_literal : Warning<
   "identifier after literal will be treated as a user-defined literal suffix "
   "in C++11">, InGroup<CXX11Compat>, DefaultIgnore;

Modified: cfe/trunk/include/clang/Basic/TokenKinds.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/TokenKinds.def?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/TokenKinds.def (original)
+++ cfe/trunk/include/clang/Basic/TokenKinds.def Sat Nov  8 00:08:42 2014
@@ -133,6 +133,9 @@ TOK(numeric_constant)    // 0x123
 TOK(char_constant)       // 'a'
 TOK(wide_char_constant)  // L'b'
 
+// C++1z Character Constants
+TOK(utf8_char_constant)  // u8'a'
+
 // C++11 Character Constants
 TOK(utf16_char_constant) // u'a'
 TOK(utf32_char_constant) // U'a'

Modified: cfe/trunk/include/clang/Basic/TokenKinds.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/TokenKinds.h?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/TokenKinds.h (original)
+++ cfe/trunk/include/clang/Basic/TokenKinds.h Sat Nov  8 00:08:42 2014
@@ -86,9 +86,9 @@ inline bool isStringLiteral(TokenKind K)
 /// constant, string, etc.
 inline bool isLiteral(TokenKind K) {
   return K == tok::numeric_constant || K == tok::char_constant ||
-         K == tok::wide_char_constant || K == tok::utf16_char_constant ||
-         K == tok::utf32_char_constant || isStringLiteral(K) ||
-         K == tok::angle_string_literal;
+         K == tok::wide_char_constant || K == tok::utf8_char_constant ||
+         K == tok::utf16_char_constant || K == tok::utf32_char_constant ||
+         isStringLiteral(K) || K == tok::angle_string_literal;
 }
 
 /// \brief Return true if this is any of tok::annot_* kinds.

Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Sat Nov  8 00:08:42 2014
@@ -1889,17 +1889,20 @@ bool Lexer::LexAngledStringLiteral(Token
 
 
 /// LexCharConstant - Lex the remainder of a character constant, after having
-/// lexed either ' or L' or u' or U'.
+/// lexed either ' or L' or u8' or u' or U'.
 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
   // Does this character contain the \0 character?
   const char *NulCharacter = nullptr;
 
-  if (!isLexingRawMode() &&
-      (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
-    Diag(BufferPtr, getLangOpts().CPlusPlus
-           ? diag::warn_cxx98_compat_unicode_literal
-           : diag::warn_c99_compat_unicode_literal);
+  if (!isLexingRawMode()) {
+    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
+      Diag(BufferPtr, getLangOpts().CPlusPlus
+                          ? diag::warn_cxx98_compat_unicode_literal
+                          : diag::warn_c99_compat_unicode_literal);
+    else if (Kind == tok::utf8_char_constant)
+      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
+  }
 
   char C = getAndAdvanceChar(CurPtr, Result);
   if (C == '\'') {
@@ -3068,6 +3071,11 @@ LexNextToken:
                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                            SizeTmp2, Result),
                                tok::utf8_string_literal);
+        if (Char2 == '\'' && LangOpts.CPlusPlus1z)
+          return LexCharConstant(
+              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                  SizeTmp2, Result),
+              tok::utf8_char_constant);
 
         if (Char2 == 'R' && LangOpts.CPlusPlus11) {
           unsigned SizeTmp3;

Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
+++ cfe/trunk/lib/Lex/LiteralSupport.cpp Sat Nov  8 00:08:42 2014
@@ -28,6 +28,7 @@ static unsigned getCharWidth(tok::TokenK
   default: llvm_unreachable("Unknown token type!");
   case tok::char_constant:
   case tok::string_literal:
+  case tok::utf8_char_constant:
   case tok::utf8_string_literal:
     return Target.getCharWidth();
   case tok::wide_char_constant:
@@ -1031,9 +1032,10 @@ CharLiteralParser::CharLiteralParser(con
   const char *TokBegin = begin;
 
   // Skip over wide character determinant.
-  if (Kind != tok::char_constant) {
+  if (Kind != tok::char_constant)
+    ++begin;
+  if (Kind == tok::utf8_char_constant)
     ++begin;
-  }
 
   // Skip over the entry quote.
   assert(begin[0] == '\'' && "Invalid token lexed");
@@ -1077,6 +1079,8 @@ CharLiteralParser::CharLiteralParser(con
   if (tok::wide_char_constant == Kind) {
     largest_character_for_kind =
         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
+  } else if (tok::utf8_char_constant == Kind) {
+    largest_character_for_kind = 0x7F;
   } else if (tok::utf16_char_constant == Kind) {
     largest_character_for_kind = 0xFFFF;
   } else if (tok::utf32_char_constant == Kind) {

Modified: cfe/trunk/lib/Lex/MacroArgs.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/MacroArgs.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/MacroArgs.cpp (original)
+++ cfe/trunk/lib/Lex/MacroArgs.cpp Sat Nov  8 00:08:42 2014
@@ -218,6 +218,7 @@ Token MacroArgs::StringifyArgument(const
     if (tok::isStringLiteral(Tok.getKind()) || // "foo", u8R"x(foo)x"_bar, etc.
         Tok.is(tok::char_constant) ||          // 'x'
         Tok.is(tok::wide_char_constant) ||     // L'x'.
+        Tok.is(tok::utf8_char_constant) ||     // u8'x'.
         Tok.is(tok::utf16_char_constant) ||    // u'x'.
         Tok.is(tok::utf32_char_constant)) {    // U'x'.
       bool Invalid = false;

Modified: cfe/trunk/lib/Lex/PPExpressions.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/PPExpressions.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/PPExpressions.cpp (original)
+++ cfe/trunk/lib/Lex/PPExpressions.cpp Sat Nov  8 00:08:42 2014
@@ -273,6 +273,7 @@ static bool EvaluateValue(PPValue &Resul
   }
   case tok::char_constant:          // 'x'
   case tok::wide_char_constant:     // L'x'
+  case tok::utf8_char_constant:     // u8'x'
   case tok::utf16_char_constant:    // u'x'
   case tok::utf32_char_constant: {  // U'x'
     // Complain about, and drop, any ud-suffix.

Modified: cfe/trunk/lib/Lex/TokenConcatenation.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/TokenConcatenation.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/TokenConcatenation.cpp (original)
+++ cfe/trunk/lib/Lex/TokenConcatenation.cpp Sat Nov  8 00:08:42 2014
@@ -99,6 +99,10 @@ TokenConcatenation::TokenConcatenation(P
     TokenInfo[tok::utf32_char_constant ] |= aci_custom;
   }
 
+  // These tokens have custom code in C++1z mode.
+  if (PP.getLangOpts().CPlusPlus1z)
+    TokenInfo[tok::utf8_char_constant] |= aci_custom;
+
   // These tokens change behavior if followed by an '='.
   TokenInfo[tok::amp         ] |= aci_avoid_equal;           // &=
   TokenInfo[tok::plus        ] |= aci_avoid_equal;           // +=
@@ -213,6 +217,7 @@ bool TokenConcatenation::AvoidConcat(con
   case tok::utf32_string_literal:
   case tok::char_constant:
   case tok::wide_char_constant:
+  case tok::utf8_char_constant:
   case tok::utf16_char_constant:
   case tok::utf32_char_constant:
     if (!PP.getLangOpts().CPlusPlus11)
@@ -236,7 +241,8 @@ bool TokenConcatenation::AvoidConcat(con
     if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) ||
         Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) ||
         Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) ||
-        Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant))
+        Tok.is(tok::utf8_char_constant) || Tok.is(tok::utf16_char_constant) ||
+        Tok.is(tok::utf32_char_constant))
       return true;
 
     // If this isn't identifier + string, we're done.

Modified: cfe/trunk/lib/Parse/ParseExpr.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Parse/ParseExpr.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/lib/Parse/ParseExpr.cpp (original)
+++ cfe/trunk/lib/Parse/ParseExpr.cpp Sat Nov  8 00:08:42 2014
@@ -910,6 +910,7 @@ ExprResult Parser::ParseCastExpression(b
   }
   case tok::char_constant:     // constant: character-constant
   case tok::wide_char_constant:
+  case tok::utf8_char_constant:
   case tok::utf16_char_constant:
   case tok::utf32_char_constant:
     Res = Actions.ActOnCharacterConstant(Tok, /*UDLScope*/getCurScope());

Modified: cfe/trunk/lib/Parse/ParseTentative.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Parse/ParseTentative.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/lib/Parse/ParseTentative.cpp (original)
+++ cfe/trunk/lib/Parse/ParseTentative.cpp Sat Nov  8 00:08:42 2014
@@ -892,6 +892,7 @@ Parser::isExpressionOrTypeSpecifierSimpl
   case tok::numeric_constant:
   case tok::char_constant:
   case tok::wide_char_constant:
+  case tok::utf8_char_constant:
   case tok::utf16_char_constant:
   case tok::utf32_char_constant:
   case tok::string_literal:

Modified: cfe/trunk/test/Lexer/utf8-char-literal.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/utf8-char-literal.cpp?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/test/Lexer/utf8-char-literal.cpp (original)
+++ cfe/trunk/test/Lexer/utf8-char-literal.cpp Sat Nov  8 00:08:42 2014
@@ -1,6 +1,15 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++11 -fsyntax-only -verify %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c11 -x c -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++1z -fsyntax-only -verify %s
 
 int array0[u'Ã±' == u'\xf1'? 1 : -1];
 int array1['\xF1' !=  u'\xf1'? 1 : -1];
 int array1['Ã±' !=  u'\xf1'? 1 : -1]; // expected-error {{character too large for enclosing character literal type}}
+#if __cplusplus > 201402L
+char a = u8'Ã±'; // expected-error {{character too large for enclosing character literal type}}
+char b = u8'\x80'; // ok
+char c = u8'\u0080'; // expected-error {{character too large for enclosing character literal type}}
+char d = u8'\u1234'; // expected-error {{character too large for enclosing character literal type}}
+char e = u8'á´'; // expected-error {{character too large for enclosing character literal type}}
+char f = u8'ab'; // expected-error {{Unicode character literals may not contain multiple characters}}
+#endif

Modified: cfe/trunk/www/cxx_status.html
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/www/cxx_status.html?rev=221576&r1=221575&r2=221576&view=diff
==============================================================================
--- cfe/trunk/www/cxx_status.html (original)
+++ cfe/trunk/www/cxx_status.html Sat Nov  8 00:08:42 2014
@@ -549,12 +549,17 @@ as the draft C++1z standard evolves.</p>
     <!-- Urbana papers -->
     <tr>
       <td>Fold expressions</td>
-      <td><!--<a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4051.html">-->N4295<!--</a>--></td>
+      <td><!--<a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4295.html">-->N4295<!--</a>--></td>
+      <td class="svn" align="center">SVN</td>
+    </tr>
+    <tr>
+      <td><tt>u8</tt> character literals</td>
+      <td><!--<a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4267.html">-->N4267<!--</a>--></td>
       <td class="svn" align="center">SVN</td>
     </tr>
     <tr>
       <td>Nested namespace definition</td>
-      <td><!--<a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4051.html">-->N4230<!--</a>--></td>
+      <td><!--<a href="http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4230.html">-->N4230<!--</a>--></td>
       <td class="svn" align="center">SVN</td>
     </tr>
 </table>