[cfe-commits] r136210 - in /cfe/trunk: include/clang/AST/ include/clang/Basic/ include/clang/Lex/ include/clang/Parse/ lib/AST/ lib/CodeGen/ lib/Lex/ lib/Parse/ lib/Rewrite/ lib/Sema/ lib/Serialization/ test/CXX/lex/lex.literal/lex.ccon/ test/CodeGen/ test/Lexer/ test/Parser/ test/SemaCXX/

Wed Jul 27 07:17:02 PDT 2011

On Tue, Jul 26, 2011 at 10:40 PM, Douglas Gregor <dgregor at apple.com> wrote:
> Author: dgregor
> Date: Wed Jul 27 00:40:30 2011
> New Revision: 136210
>
> URL: http://llvm.org/viewvc/llvm-project?rev=136210&view=rev
> Log:
> Add support for C++0x unicode string and character literals, from Craig Topper!

Cool! One question below.

>
> Modified:
>    cfe/trunk/include/clang/AST/Expr.h
>    cfe/trunk/include/clang/AST/Type.h
>    cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
>    cfe/trunk/include/clang/Basic/IdentifierTable.h
>    cfe/trunk/include/clang/Basic/TokenKinds.def
>    cfe/trunk/include/clang/Lex/Lexer.h
>    cfe/trunk/include/clang/Lex/LiteralSupport.h
>    cfe/trunk/include/clang/Lex/Token.h
>    cfe/trunk/include/clang/Lex/TokenConcatenation.h
>    cfe/trunk/include/clang/Parse/Parser.h
>    cfe/trunk/lib/AST/ASTImporter.cpp
>    cfe/trunk/lib/AST/Expr.cpp
>    cfe/trunk/lib/AST/StmtDumper.cpp
>    cfe/trunk/lib/AST/StmtPrinter.cpp
>    cfe/trunk/lib/AST/StmtProfile.cpp
>    cfe/trunk/lib/AST/Type.cpp
>    cfe/trunk/lib/CodeGen/CodeGenModule.cpp
>    cfe/trunk/lib/Lex/Lexer.cpp
>    cfe/trunk/lib/Lex/LiteralSupport.cpp
>    cfe/trunk/lib/Lex/MacroArgs.cpp
>    cfe/trunk/lib/Lex/PPDirectives.cpp
>    cfe/trunk/lib/Lex/PPExpressions.cpp
>    cfe/trunk/lib/Lex/Pragma.cpp
>    cfe/trunk/lib/Lex/TokenConcatenation.cpp
>    cfe/trunk/lib/Parse/ParseCXXInlineMethods.cpp
>    cfe/trunk/lib/Parse/ParseExpr.cpp
>    cfe/trunk/lib/Parse/ParseTentative.cpp
>    cfe/trunk/lib/Parse/Parser.cpp
>    cfe/trunk/lib/Rewrite/HTMLRewrite.cpp
>    cfe/trunk/lib/Rewrite/RewriteObjC.cpp
>    cfe/trunk/lib/Sema/SemaChecking.cpp
>    cfe/trunk/lib/Sema/SemaDeclAttr.cpp
>    cfe/trunk/lib/Sema/SemaExpr.cpp
>    cfe/trunk/lib/Sema/SemaExprCXX.cpp
>    cfe/trunk/lib/Sema/SemaExprObjC.cpp
>    cfe/trunk/lib/Sema/SemaInit.cpp
>    cfe/trunk/lib/Sema/SemaStmt.cpp
>    cfe/trunk/lib/Sema/SemaTemplate.cpp
>    cfe/trunk/lib/Serialization/ASTReaderStmt.cpp
>    cfe/trunk/lib/Serialization/ASTWriterStmt.cpp
>    cfe/trunk/test/CXX/lex/lex.literal/lex.ccon/p1.cpp
>    cfe/trunk/test/CodeGen/char-literal.c
>    cfe/trunk/test/CodeGen/string-literal.c
>    cfe/trunk/test/Lexer/wchar.c
>    cfe/trunk/test/Parser/char-literal-printing.c
>    cfe/trunk/test/SemaCXX/type-convert-construct.cpp
>
> Modified: cfe/trunk/include/clang/AST/Expr.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/Expr.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/AST/Expr.h (original)
> +++ cfe/trunk/include/clang/AST/Expr.h Wed Jul 27 00:40:30 2011
> @@ -1112,29 +1112,39 @@
>  };
>
>  class CharacterLiteral : public Expr {
> +public:
> +  enum CharacterKind {
> +    Ascii,
> +    Wide,
> +    UTF16,
> +    UTF32
> +  };
> +
> +private:
>   unsigned Value;
>   SourceLocation Loc;
> -  bool IsWide;
> +  unsigned Kind : 2;
>  public:
>   // type should be IntTy
> -  CharacterLiteral(unsigned value, bool iswide, QualType type, SourceLocation l)
> +  CharacterLiteral(unsigned value, CharacterKind kind, QualType type,
> +                   SourceLocation l)
>     : Expr(CharacterLiteralClass, type, VK_RValue, OK_Ordinary, false, false,
>            false, false),
> -      Value(value), Loc(l), IsWide(iswide) {
> +      Value(value), Loc(l), Kind(kind) {
>   }
>
>   /// \brief Construct an empty character literal.
>   CharacterLiteral(EmptyShell Empty) : Expr(CharacterLiteralClass, Empty) { }
>
>   SourceLocation getLocation() const { return Loc; }
> -  bool isWide() const { return IsWide; }
> +  CharacterKind getKind() const { return static_cast<CharacterKind>(Kind); }
>
>   SourceRange getSourceRange() const { return SourceRange(Loc); }
>
>   unsigned getValue() const { return Value; }
>
>   void setLocation(SourceLocation Location) { Loc = Location; }
> -  void setWide(bool W) { IsWide = W; }
> +  void setKind(CharacterKind kind) { Kind = kind; }
>   void setValue(unsigned Val) { Value = Val; }
>
>   static bool classof(const Stmt *T) {
> @@ -1243,13 +1253,23 @@
>  /// In this case, getByteLength() will return 6, but the string literal will
>  /// have type "char[2]".
>  class StringLiteral : public Expr {
> +public:
> +  enum StringKind {
> +    Ascii,
> +    Wide,
> +    UTF8,
> +    UTF16,
> +    UTF32
> +  };
> +
> +private:
>   friend class ASTStmtReader;
>
>   const char *StrData;
>   unsigned ByteLength;
> -  bool IsWide;
> -  bool IsPascal;
>   unsigned NumConcatenated;
> +  unsigned Kind : 3;
> +  bool IsPascal : 1;
>   SourceLocation TokLocs[1];
>
>   StringLiteral(QualType Ty) :
> @@ -1259,14 +1279,15 @@
>  public:
>   /// This is the "fully general" constructor that allows representation of
>   /// strings formed from multiple concatenated tokens.
> -  static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide,
> +  static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind,
>                                bool Pascal, QualType Ty,
>                                const SourceLocation *Loc, unsigned NumStrs);
>
>   /// Simple constructor for string literals made from one token.
> -  static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide,
> -                               bool Pascal, QualType Ty, SourceLocation Loc) {
> -    return Create(C, Str, Wide, Pascal, Ty, &Loc, 1);
> +  static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind,
> +                               bool Pascal, QualType Ty,
> +                               SourceLocation Loc) {
> +    return Create(C, Str, Kind, Pascal, Ty, &Loc, 1);
>   }
>
>   /// \brief Construct an empty string literal.
> @@ -1281,9 +1302,14 @@
>   /// \brief Sets the string data to the given string data.
>   void setString(ASTContext &C, StringRef Str);
>
> -  bool isWide() const { return IsWide; }
> +  StringKind getKind() const { return static_cast<StringKind>(Kind); }
> +  bool isAscii() const { return Kind == Ascii; }
> +  bool isWide() const { return Kind == Wide; }
> +  bool isUTF8() const { return Kind == UTF8; }
> +  bool isUTF16() const { return Kind == UTF16; }
> +  bool isUTF32() const { return Kind == UTF32; }
>   bool isPascal() const { return IsPascal; }
> -
> +
>   bool containsNonAsciiOrNull() const {
>     StringRef Str = getString();
>     for (unsigned i = 0, e = Str.size(); i != e; ++i)
>
> Modified: cfe/trunk/include/clang/AST/Type.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/Type.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/AST/Type.h (original)
> +++ cfe/trunk/include/clang/AST/Type.h Wed Jul 27 00:40:30 2011
> @@ -1368,6 +1368,8 @@
>   bool isBooleanType() const;
>   bool isCharType() const;
>   bool isWideCharType() const;
> +  bool isChar16Type() const;
> +  bool isChar32Type() const;
>   bool isAnyCharacterType() const;
>   bool isIntegralType(ASTContext &Ctx) const;
>
>
> Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
> +++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Wed Jul 27 00:40:30 2011
> @@ -77,8 +77,8 @@
>   "invalid suffix '%0' on integer constant">;
>  def err_invalid_suffix_float_constant : Error<
>   "invalid suffix '%0' on floating constant">;
> -def warn_extraneous_wide_char_constant : Warning<
> -  "extraneous characters in wide character constant ignored">;
> +def warn_extraneous_char_constant : Warning<
> +  "extraneous characters in character constant ignored">;
>  def warn_char_constant_too_large : Warning<
>   "character constant too long for its type">;
>  def err_exponent_has_no_digits : Error<"exponent has no digits">;
> @@ -102,6 +102,8 @@
>   "character unicode escape sequence too long for its type">;
>  def warn_ucn_not_valid_in_c89 : ExtWarn<
>   "unicode escape sequences are only valid in C99 or C++">;
> +def err_unsupported_string_concat : Error<
> +  "unsupported non-standard concatenation of string literals">;
>
>  //===----------------------------------------------------------------------===//
>  // PTH Diagnostics
>
> Modified: cfe/trunk/include/clang/Basic/IdentifierTable.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/IdentifierTable.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/IdentifierTable.h (original)
> +++ cfe/trunk/include/clang/Basic/IdentifierTable.h Wed Jul 27 00:40:30 2011
> @@ -50,8 +50,8 @@
>  /// set, and all tok::identifier tokens have a pointer to one of these.
>  class IdentifierInfo {
>   // Note: DON'T make TokenID a 'tok::TokenKind'; MSVC will treat it as a
> -  //       signed char and TokenKinds > 127 won't be handled correctly.
> -  unsigned TokenID            : 8; // Front-end token ID or tok::identifier.
> +  //       signed char and TokenKinds > 255 won't be handled correctly.
> +  unsigned TokenID            : 9; // Front-end token ID or tok::identifier.
>   // Objective-C keyword ('protocol' in '@protocol') or builtin (__builtin_inf).
>   // First NUM_OBJC_KEYWORDS values are for Objective-C, the remaining values
>   // are for builtins.
> @@ -65,7 +65,7 @@
>                                    // file and wasn't modified since.
>   bool RevertedTokenID        : 1; // True if RevertTokenIDToIdentifier was
>                                    // called.
> -  // 6 bits left in 32-bit word.
> +  // 5 bits left in 32-bit word.
>   void *FETokenInfo;               // Managed by the language front-end.
>   llvm::StringMapEntry<IdentifierInfo*> *Entry;
>
> @@ -409,6 +409,7 @@
>   IdentifierInfo &get(StringRef Name, tok::TokenKind TokenCode) {
>     IdentifierInfo &II = get(Name);
>     II.TokenID = TokenCode;
> +    assert(II.TokenID == TokenCode && "TokenCode too large");
>     return II;
>   }
>
>
> Modified: cfe/trunk/include/clang/Basic/TokenKinds.def
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/TokenKinds.def?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/TokenKinds.def (original)
> +++ cfe/trunk/include/clang/Basic/TokenKinds.def Wed Jul 27 00:40:30 2011
> @@ -114,13 +114,23 @@
>  TOK(numeric_constant)    // 0x123
>
>  // C99 6.4.4: Character Constants
> -TOK(char_constant)       // 'a'   L'b'
> +TOK(char_constant)       // 'a'
> +TOK(wide_char_constant)  // L'b'
> +
> +// C++0x Character Constants
> +TOK(utf16_char_constant) // u'a'
> +TOK(utf32_char_constant) // U'a'
>
>  // C99 6.4.5: String Literals.
>  TOK(string_literal)      // "foo"
>  TOK(wide_string_literal) // L"foo"
>  TOK(angle_string_literal)// <foo>
>
> +// C++0x String Literals.
> +TOK(utf8_string_literal) // u8"foo"
> +TOK(utf16_string_literal)// u"foo"
> +TOK(utf32_string_literal)// U"foo"
> +
>  // C99 6.4.6: Punctuators.
>  PUNCTUATOR(l_square,            "[")
>  PUNCTUATOR(r_square,            "]")
>
> Modified: cfe/trunk/include/clang/Lex/Lexer.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/Lexer.h (original)
> +++ cfe/trunk/include/clang/Lex/Lexer.h Wed Jul 27 00:40:30 2011
> @@ -471,9 +471,11 @@
>   // Helper functions to lex the remainder of a token of the specific type.
>   void LexIdentifier         (Token &Result, const char *CurPtr);
>   void LexNumericConstant    (Token &Result, const char *CurPtr);
> -  void LexStringLiteral      (Token &Result, const char *CurPtr,bool Wide);
> +  void LexStringLiteral      (Token &Result, const char *CurPtr,
> +                              tok::TokenKind Kind);
>   void LexAngledStringLiteral(Token &Result, const char *CurPtr);
> -  void LexCharConstant       (Token &Result, const char *CurPtr);
> +  void LexCharConstant       (Token &Result, const char *CurPtr,
> +                              tok::TokenKind Kind);
>   bool LexEndOfFile          (Token &Result, const char *CurPtr);
>
>   bool SkipWhitespace        (Token &Result, const char *CurPtr);
>
> Modified: cfe/trunk/include/clang/Lex/LiteralSupport.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/LiteralSupport.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/LiteralSupport.h (original)
> +++ cfe/trunk/include/clang/Lex/LiteralSupport.h Wed Jul 27 00:40:30 2011
> @@ -19,6 +19,7 @@
>  #include "llvm/ADT/APFloat.h"
>  #include "llvm/ADT/SmallString.h"
>  #include "llvm/Support/DataTypes.h"
> +#include "clang/Basic/TokenKinds.h"
>  #include <cctype>
>
>  namespace clang {
> @@ -124,15 +125,19 @@
>  /// character literal.
>  class CharLiteralParser {
>   uint64_t Value;
> -  bool IsWide;
> +  tok::TokenKind Kind;
>   bool IsMultiChar;
>   bool HadError;
>  public:
>   CharLiteralParser(const char *begin, const char *end,
> -                    SourceLocation Loc, Preprocessor &PP);
> +                    SourceLocation Loc, Preprocessor &PP,
> +                    tok::TokenKind kind);
>
>   bool hadError() const { return HadError; }
> -  bool isWide() const { return IsWide; }
> +  bool isAscii() const { return Kind == tok::char_constant; }
> +  bool isWide() const { return Kind == tok::wide_char_constant; }
> +  bool isUTF16() const { return Kind == tok::utf16_char_constant; }
> +  bool isUTF32() const { return Kind == tok::utf32_char_constant; }
>   bool isMultiChar() const { return IsMultiChar; }
>   uint64_t getValue() const { return Value; }
>  };
> @@ -148,7 +153,8 @@
>
>   unsigned MaxTokenLength;
>   unsigned SizeBound;
> -  unsigned wchar_tByteWidth;
> +  unsigned CharByteWidth;
> +  tok::TokenKind Kind;
>   llvm::SmallString<512> ResultBuf;
>   char *ResultPtr; // cursor
>  public:
> @@ -158,14 +164,13 @@
>                       const SourceManager &sm, const LangOptions &features,
>                       const TargetInfo &target, Diagnostic *diags = 0)
>     : SM(sm), Features(features), Target(target), Diags(diags),
> -      MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
> -      ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
> +      MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
> +      ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
>     init(StringToks, NumStringToks);
>   }
>
>
>   bool hadError;
> -  bool AnyWide;
>   bool Pascal;
>
>   StringRef GetString() const {
> @@ -174,9 +179,7 @@
>   unsigned GetStringLength() const { return ResultPtr-ResultBuf.data(); }
>
>   unsigned GetNumStringChars() const {
> -    if (AnyWide)
> -      return GetStringLength() / wchar_tByteWidth;
> -    return GetStringLength();
> +    return GetStringLength() / CharByteWidth;
>   }
>   /// getOffsetOfStringByte - This function returns the offset of the
>   /// specified byte of the string data represented by Token.  This handles
> @@ -185,7 +188,13 @@
>   /// If the Diagnostics pointer is non-null, then this will do semantic
>   /// checking of the string literal and emit errors and warnings.
>   unsigned getOffsetOfStringByte(const Token &TheTok, unsigned ByteNo) const;
> -
> +
> +  bool isAscii() { return Kind == tok::string_literal; }
> +  bool isWide() { return Kind == tok::wide_string_literal; }
> +  bool isUTF8() { return Kind == tok::utf8_string_literal; }
> +  bool isUTF16() { return Kind == tok::utf16_string_literal; }
> +  bool isUTF32() { return Kind == tok::utf32_string_literal; }
> +
>  private:
>   void init(const Token *StringToks, unsigned NumStringToks);
>  };
>
> Modified: cfe/trunk/include/clang/Lex/Token.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Token.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/Token.h (original)
> +++ cfe/trunk/include/clang/Lex/Token.h Wed Jul 27 00:40:30 2011
> @@ -96,7 +96,10 @@
>   /// constant, string, etc.
>   bool isLiteral() const {
>     return is(tok::numeric_constant) || is(tok::char_constant) ||
> -           is(tok::string_literal) || is(tok::wide_string_literal) ||
> +           is(tok::wide_char_constant) || is(tok::utf16_char_constant) ||
> +           is(tok::utf32_char_constant) || is(tok::string_literal) ||
> +           is(tok::wide_string_literal) || is(tok::utf8_string_literal) ||
> +           is(tok::utf16_string_literal) || is(tok::utf32_string_literal) ||
>            is(tok::angle_string_literal);
>   }
>
>
> Modified: cfe/trunk/include/clang/Lex/TokenConcatenation.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/TokenConcatenation.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Lex/TokenConcatenation.h (original)
> +++ cfe/trunk/include/clang/Lex/TokenConcatenation.h Wed Jul 27 00:40:30 2011
> @@ -63,12 +63,9 @@
>                      const Token &Tok) const;
>
>   private:
> -    /// StartsWithL - Return true if the spelling of this token starts with 'L'.
> -    bool StartsWithL(const Token &Tok) const;
> -
> -    /// IsIdentifierL - Return true if the spelling of this token is literally
> -    /// 'L'.
> -    bool IsIdentifierL(const Token &Tok) const;
> +    /// IsIdentifierStringPrefix - Return true if the spelling of the token
> +    /// is literally 'L', 'u', 'U', or 'u8'.
> +    bool IsIdentifierStringPrefix(const Token &Tok) const;
>   };
>   } // end clang namespace
>
>
> Modified: cfe/trunk/include/clang/Parse/Parser.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Parse/Parser.h?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Parse/Parser.h (original)
> +++ cfe/trunk/include/clang/Parse/Parser.h Wed Jul 27 00:40:30 2011
> @@ -265,7 +265,10 @@
>   ///
>   bool isTokenStringLiteral() const {
>     return Tok.getKind() == tok::string_literal ||
> -           Tok.getKind() == tok::wide_string_literal;
> +           Tok.getKind() == tok::wide_string_literal ||
> +           Tok.getKind() == tok::utf8_string_literal ||
> +           Tok.getKind() == tok::utf16_string_literal ||
> +           Tok.getKind() == tok::utf32_string_literal;
>   }
>
>   /// \brief Returns true if the current token is a '=' or '==' and
>
> Modified: cfe/trunk/lib/AST/ASTImporter.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/ASTImporter.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/AST/ASTImporter.cpp (original)
> +++ cfe/trunk/lib/AST/ASTImporter.cpp Wed Jul 27 00:40:30 2011
> @@ -3814,8 +3814,8 @@
>   if (T.isNull())
>     return 0;
>
> -  return new (Importer.getToContext()) CharacterLiteral(E->getValue(),
> -                                                        E->isWide(), T,
> +  return new (Importer.getToContext()) CharacterLiteral(E->getValue(),
> +                                                        E->getKind(), T,
>                                           Importer.Import(E->getLocation()));
>  }
>
>
> Modified: cfe/trunk/lib/AST/Expr.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/Expr.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/AST/Expr.cpp (original)
> +++ cfe/trunk/lib/AST/Expr.cpp Wed Jul 27 00:40:30 2011
> @@ -533,8 +533,7 @@
>  }
>
>  StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str,
> -                                     bool Wide,
> -                                     bool Pascal, QualType Ty,
> +                                     StringKind Kind, bool Pascal, QualType Ty,
>                                      const SourceLocation *Loc,
>                                      unsigned NumStrs) {
>   // Allocate enough space for the StringLiteral plus an array of locations for
> @@ -549,7 +548,7 @@
>   memcpy(AStrData, Str.data(), Str.size());
>   SL->StrData = AStrData;
>   SL->ByteLength = Str.size();
> -  SL->IsWide = Wide;
> +  SL->Kind = Kind;
>   SL->IsPascal = Pascal;
>   SL->TokLocs[0] = Loc[0];
>   SL->NumConcatenated = NumStrs;
> @@ -587,8 +586,8 @@
>  SourceLocation StringLiteral::
>  getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
>                   const LangOptions &Features, const TargetInfo &Target) const {
> -  assert(!isWide() && "This doesn't work for wide strings yet");
> -
> +  assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings");
> +
>   // Loop over all of the tokens in this string until we find the one that
>   // contains the byte we're looking for.
>   unsigned TokNo = 0;
>
> Modified: cfe/trunk/lib/AST/StmtDumper.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/StmtDumper.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/AST/StmtDumper.cpp (original)
> +++ cfe/trunk/lib/AST/StmtDumper.cpp Wed Jul 27 00:40:30 2011
> @@ -443,8 +443,13 @@
>   DumpExpr(Str);
>   // FIXME: this doesn't print wstrings right.
>   OS << " ";
> -  if (Str->isWide())
> -    OS << "L";
> +  switch (Str->getKind()) {
> +  case StringLiteral::Ascii: break; // No prefix
> +  case StringLiteral::Wide:  OS << 'L'; break;
> +  case StringLiteral::UTF8:  OS << "u8"; break;
> +  case StringLiteral::UTF16: OS << 'u'; break;
> +  case StringLiteral::UTF32: OS << 'U'; break;
> +  }
>   OS << '"';
>   OS.write_escaped(Str->getString());
>   OS << '"';
>
> Modified: cfe/trunk/lib/AST/StmtPrinter.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/StmtPrinter.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/AST/StmtPrinter.cpp (original)
> +++ cfe/trunk/lib/AST/StmtPrinter.cpp Wed Jul 27 00:40:30 2011
> @@ -599,8 +599,14 @@
>
>  void StmtPrinter::VisitCharacterLiteral(CharacterLiteral *Node) {
>   unsigned value = Node->getValue();
> -  if (Node->isWide())
> -    OS << "L";
> +
> +  switch (Node->getKind()) {
> +  case CharacterLiteral::Ascii: break; // no prefix.
> +  case CharacterLiteral::Wide:  OS << 'L'; break;
> +  case CharacterLiteral::UTF16: OS << 'u'; break;
> +  case CharacterLiteral::UTF32: OS << 'U'; break;
> +  }
> +
>   switch (value) {
>   case '\\':
>     OS << "'\\\\'";
> @@ -672,7 +678,13 @@
>  }
>
>  void StmtPrinter::VisitStringLiteral(StringLiteral *Str) {
> -  if (Str->isWide()) OS << 'L';
> +  switch (Str->getKind()) {
> +  case StringLiteral::Ascii: break; // no prefix.
> +  case StringLiteral::Wide:  OS << 'L'; break;
> +  case StringLiteral::UTF8:  OS << "u8"; break;
> +  case StringLiteral::UTF16: OS << 'u'; break;
> +  case StringLiteral::UTF32: OS << 'U'; break;
> +  }
>   OS << '"';
>
>   // FIXME: this doesn't print wstrings right.
>
> Modified: cfe/trunk/lib/AST/StmtProfile.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/StmtProfile.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/AST/StmtProfile.cpp (original)
> +++ cfe/trunk/lib/AST/StmtProfile.cpp Wed Jul 27 00:40:30 2011
> @@ -252,7 +252,7 @@
>
>  void StmtProfiler::VisitCharacterLiteral(const CharacterLiteral *S) {
>   VisitExpr(S);
> -  ID.AddBoolean(S->isWide());
> +  ID.AddInteger(S->getKind());
>   ID.AddInteger(S->getValue());
>  }
>
> @@ -269,7 +269,7 @@
>  void StmtProfiler::VisitStringLiteral(const StringLiteral *S) {
>   VisitExpr(S);
>   ID.AddString(S->getString());
> -  ID.AddBoolean(S->isWide());
> +  ID.AddInteger(S->getKind());
>  }
>
>  void StmtProfiler::VisitParenExpr(const ParenExpr *S) {
>
> Modified: cfe/trunk/lib/AST/Type.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/Type.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/AST/Type.cpp (original)
> +++ cfe/trunk/lib/AST/Type.cpp Wed Jul 27 00:40:30 2011
> @@ -635,6 +635,18 @@
>   return false;
>  }
>
> +bool Type::isChar16Type() const {
> +  if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
> +    return BT->getKind() == BuiltinType::Char16;
> +  return false;
> +}
> +
> +bool Type::isChar32Type() const {
> +  if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType))
> +    return BT->getKind() == BuiltinType::Char32;
> +  return false;
> +}
> +
>  /// \brief Determine whether this type is any of the built-in character
>  /// types.
>  bool Type::isAnyCharacterType() const {
>
> Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original)
> +++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Wed Jul 27 00:40:30 2011
> @@ -1877,8 +1877,20 @@
>   // Resize the string to the right size.
>   uint64_t RealLen = CAT->getSize().getZExtValue();
>
> -  if (E->isWide())
> +  switch (E->getKind()) {
> +  case StringLiteral::Ascii:
> +  case StringLiteral::UTF8:
> +    break;
> +  case StringLiteral::Wide:
>     RealLen *= Context.Target.getWCharWidth() / Context.getCharWidth();
> +    break;
> +  case StringLiteral::UTF16:
> +    RealLen *= Context.Target.getChar16Width() / Context.getCharWidth();
> +    break;
> +  case StringLiteral::UTF32:
> +    RealLen *= Context.Target.getChar32Width() / Context.getCharWidth();
> +    break;
> +  }
>
>   std::string Str = E->getString().str();
>   Str.resize(RealLen, '\0');
> @@ -1893,7 +1905,7 @@
>   // FIXME: This can be more efficient.
>   // FIXME: We shouldn't need to bitcast the constant in the wide string case.
>   llvm::Constant *C = GetAddrOfConstantString(GetStringForStringLiteral(S));
> -  if (S->isWide()) {
> +  if (S->isWide() || S->isUTF16() || S->isUTF32()) {
>     llvm::Type *DestTy =
>         llvm::PointerType::getUnqual(getTypes().ConvertType(S->getType()));
>     C = llvm::ConstantExpr::getBitCast(C, DestTy);
>
> Modified: cfe/trunk/lib/Lex/Lexer.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/Lexer.cpp (original)
> +++ cfe/trunk/lib/Lex/Lexer.cpp Wed Jul 27 00:40:30 2011
> @@ -1267,8 +1267,9 @@
>  }
>
>  /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
> -/// either " or L".
> -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
> +/// either " or L" or u8" or u" or U".
> +void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
> +                             tok::TokenKind Kind) {
>   const char *NulCharacter = 0; // Does this string contain the \0 character?
>
>   char C = getAndAdvanceChar(CurPtr, Result);
> @@ -1299,8 +1300,7 @@
>
>   // Update the location of the token as well as the BufferPtr instance var.
>   const char *TokStart = BufferPtr;
> -  FormTokenWithChars(Result, CurPtr,
> -                     Wide ? tok::wide_string_literal : tok::string_literal);
> +  FormTokenWithChars(Result, CurPtr, Kind);
>   Result.setLiteralData(TokStart);
>  }
>
> @@ -1339,8 +1339,9 @@
>
>
>  /// LexCharConstant - Lex the remainder of a character constant, after having
> -/// lexed either ' or L'.
> -void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
> +/// lexed either ' or L' or u' or U'.
> +void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
> +                            tok::TokenKind Kind) {
>   const char *NulCharacter = 0; // Does this character contain the \0 character?
>
>   char C = getAndAdvanceChar(CurPtr, Result);
> @@ -1377,7 +1378,7 @@
>
>   // Update the location of token as well as BufferPtr.
>   const char *TokStart = BufferPtr;
> -  FormTokenWithChars(Result, CurPtr, tok::char_constant);
> +  FormTokenWithChars(Result, CurPtr, Kind);
>   Result.setLiteralData(TokStart);
>  }
>
> @@ -2185,6 +2186,55 @@
>     MIOpt.ReadToken();
>     return LexNumericConstant(Result, CurPtr);
>
> +  case 'u':   // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
> +    // Notify MIOpt that we read a non-whitespace/non-comment token.
> +    MIOpt.ReadToken();
> +
> +    if (Features.CPlusPlus0x) {
> +      Char = getCharAndSize(CurPtr, SizeTmp);
> +
> +      // UTF-16 string literal
> +      if (Char == '"')
> +        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> +                                tok::utf16_string_literal);
> +
> +      // UTF-16 character constant
> +      if (Char == '\'')
> +        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> +                               tok::utf16_char_constant);
> +
> +      // UTF-8 string literal
> +      if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
> +        return LexStringLiteral(Result,
> +                              ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
> +                                          SizeTmp2, Result),
> +                              tok::utf8_string_literal);
> +    }
> +
> +    // treat u like the start of an identifier.
> +    return LexIdentifier(Result, CurPtr);
> +
> +  case 'U':   // Identifier (Uber) or C++0x UTF-32 string literal
> +    // Notify MIOpt that we read a non-whitespace/non-comment token.
> +    MIOpt.ReadToken();
> +
> +    if (Features.CPlusPlus0x) {
> +      Char = getCharAndSize(CurPtr, SizeTmp);
> +
> +      // UTF-32 string literal
> +      if (Char == '"')
> +        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> +                                tok::utf32_string_literal);
> +
> +      // UTF-32 character constant
> +      if (Char == '\'')
> +        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> +                               tok::utf32_char_constant);
> +    }
> +
> +    // treat U like the start of an identifier.
> +    return LexIdentifier(Result, CurPtr);
> +
>   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
>     // Notify MIOpt that we read a non-whitespace/non-comment token.
>     MIOpt.ReadToken();
> @@ -2193,21 +2243,22 @@
>     // Wide string literal.
>     if (Char == '"')
>       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> -                              true);
> +                              tok::wide_string_literal);
>
>     // Wide character constant.
>     if (Char == '\'')
> -      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
> +      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
> +                             tok::wide_char_constant);
>     // FALL THROUGH, treating L like the start of an identifier.
>
>   // C99 6.4.2: Identifiers.
>   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
>   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
> -  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
> +  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T':    /*'U'*/
>   case 'V': case 'W': case 'X': case 'Y': case 'Z':
>   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
>   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
> -  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
> +  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
>   case 'v': case 'w': case 'x': case 'y': case 'z':
>   case '_':
>     // Notify MIOpt that we read a non-whitespace/non-comment token.
> @@ -2230,13 +2281,13 @@
>   case '\'':
>     // Notify MIOpt that we read a non-whitespace/non-comment token.
>     MIOpt.ReadToken();
> -    return LexCharConstant(Result, CurPtr);
> +    return LexCharConstant(Result, CurPtr, tok::char_constant);
>
>   // C99 6.4.5: String Literals.
>   case '"':
>     // Notify MIOpt that we read a non-whitespace/non-comment token.
>     MIOpt.ReadToken();
> -    return LexStringLiteral(Result, CurPtr, false);
> +    return LexStringLiteral(Result, CurPtr, tok::string_literal);
>
>   // C99 6.4.6: Punctuators.
>   case '?':
>
> Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
> +++ cfe/trunk/lib/Lex/LiteralSupport.cpp Wed Jul 27 00:40:30 2011
> @@ -28,12 +28,31 @@
>   return -1;
>  }
>
> +static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
> +  switch (kind) {
> +  default: assert(0 && "Unknown token type!");
> +  case tok::char_constant:
> +  case tok::string_literal:
> +  case tok::utf8_string_literal:
> +    return Target.getCharWidth();
> +  case tok::wide_char_constant:
> +  case tok::wide_string_literal:
> +    return Target.getWCharWidth();
> +  case tok::utf16_char_constant:
> +  case tok::utf16_string_literal:
> +    return Target.getChar16Width();
> +  case tok::utf32_char_constant:
> +  case tok::utf32_string_literal:
> +    return Target.getChar32Width();
> +  }
> +}
> +
>  /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
>  /// either a character or a string literal.
>  static unsigned ProcessCharEscape(const char *&ThisTokBuf,
>                                   const char *ThisTokEnd, bool &HadError,
> -                                  FullSourceLoc Loc, bool IsWide,
> -                                  Diagnostic *Diags, const TargetInfo &Target) {
> +                                  FullSourceLoc Loc, unsigned CharWidth,
> +                                  Diagnostic *Diags) {
>   // Skip the '\' char.
>   ++ThisTokBuf;
>
> @@ -98,9 +117,6 @@
>     }
>
>     // See if any bits will be truncated when evaluated as a character.
> -    unsigned CharWidth =
> -      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
> -
>     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
>       Overflow = true;
>       ResultChar &= ~0U >> (32-CharWidth);
> @@ -128,9 +144,6 @@
>              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
>
>     // Check for overflow.  Reject '\777', but not L'\777'.
> -    unsigned CharWidth =
> -      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
> -
>     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
>       if (Diags)
>         Diags->Report(Loc, diag::warn_octal_escape_too_large);
> @@ -219,8 +232,8 @@
>  /// we will likely rework our support for UCN's.
>  static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
>                             char *&ResultBuf, bool &HadError,
> -                            FullSourceLoc Loc, bool wide, Diagnostic *Diags,
> -                            const LangOptions &Features) {
> +                            FullSourceLoc Loc, unsigned CharByteWidth,
> +                            Diagnostic *Diags, const LangOptions &Features) {
>   typedef uint32_t UTF32;
>   UTF32 UcnVal = 0;
>   unsigned short UcnLen = 0;
> @@ -230,19 +243,22 @@
>     return;
>   }
>
> -  if (wide) {
> -    (void)UcnLen;
> -    assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
> +  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
> +         "only character widths of 1, 2, or 4 bytes supported");
>
> -    if (!Features.ShortWChar) {
> -      // Note: our internal rep of wide char tokens is always little-endian.
> -      *ResultBuf++ = (UcnVal & 0x000000FF);
> -      *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
> -      *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
> -      *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
> -      return;
> -    }
> +  (void)UcnLen;
> +  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
> +
> +  if (CharByteWidth == 4) {
> +    // Note: our internal rep of wide char tokens is always little-endian.
> +    *ResultBuf++ = (UcnVal & 0x000000FF);
> +    *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
> +    *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
> +    *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
> +    return;
> +  }
>
> +  if (CharByteWidth == 2) {
>     // Convert to UTF16.
>     if (UcnVal < (UTF32)0xFFFF) {
>       *ResultBuf++ = (UcnVal & 0x000000FF);
> @@ -261,6 +277,9 @@
>     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
>     return;
>   }
> +
> +  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
> +
>   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
>   // The conversion below was inspired by:
>   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
> @@ -695,13 +714,18 @@
>
>
>  CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
> -                                     SourceLocation Loc, Preprocessor &PP) {
> +                                     SourceLocation Loc, Preprocessor &PP,
> +                                     tok::TokenKind kind) {
>   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
>   HadError = false;
>
> -  // Determine if this is a wide character.
> -  IsWide = begin[0] == 'L';
> -  if (IsWide) ++begin;
> +  Kind = kind;
> +
> +  // Determine if this is a wide or UTF character.
> +  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
> +      Kind == tok::utf32_char_constant) {
> +    ++begin;
> +  }
>
>   // Skip over the entry quote.
>   assert(begin[0] == '\'' && "Invalid token lexed");
> @@ -742,17 +766,17 @@
>         ResultChar = utf32;
>       } else {
>         // Otherwise, this is a non-UCN escape character.  Process it.
> +        unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
>         ResultChar = ProcessCharEscape(begin, end, HadError,
>                                        FullSourceLoc(Loc,PP.getSourceManager()),
> -                                       IsWide,
> -                                       &PP.getDiagnostics(), PP.getTargetInfo());
> +                                       CharWidth, &PP.getDiagnostics());
>       }
>     }
>
>     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
>     // implementation defined (C99 6.4.4.4p10).
>     if (NumCharsSoFar) {
> -      if (IsWide) {
> +      if (!isAscii()) {
>         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
>         LitVal = 0;
>       } else {
> @@ -774,8 +798,8 @@
>   if (NumCharsSoFar > 1) {
>     // Warn about discarding the top bits for multi-char wide or UTF
>     // character constants (L'abcd', u'abcd', U'abcd').
> -    if (IsWide)
> -      PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
> +    if (!isAscii())
> +      PP.Diag(Loc, diag::warn_extraneous_char_constant);
>     else if (NumCharsSoFar != 4)
>       PP.Diag(Loc, diag::ext_multichar_character_literal);
>     else
> @@ -787,14 +811,15 @@
>   // Transfer the value from APInt to uint64_t
>   Value = LitVal.getZExtValue();
>
> -  if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
> +  if (((isWide() && PP.getLangOptions().ShortWChar) || isUTF16()) &&
> +      Value > 0xFFFF)
>     PP.Diag(Loc, diag::warn_ucn_escape_too_large);
>
>   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
>   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
>   // character constants are not sign extended in this implementation:
>   // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
> -  if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
> +  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
>       PP.getLangOptions().CharIsSigned)
>     Value = (signed char)Value;
>  }
> @@ -839,8 +864,8 @@
>                     Preprocessor &PP, bool Complain)
>   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
>     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
> -    MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
> -    ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
> +    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
> +    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
>   init(StringToks, NumStringToks);
>  }
>
> @@ -860,7 +885,7 @@
>   MaxTokenLength = StringToks[0].getLength();
>   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
>   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
> -  AnyWide = StringToks[0].is(tok::wide_string_literal);
> +  Kind = StringToks[0].getKind();
>
>   hadError = false;
>
> @@ -881,8 +906,18 @@
>     if (StringToks[i].getLength() > MaxTokenLength)
>       MaxTokenLength = StringToks[i].getLength();
>
> -    // Remember if we see any wide strings.
> -    AnyWide |= StringToks[i].is(tok::wide_string_literal);
> +    // Remember if we see any wide or utf-8/16/32 strings.
> +    // Also check for illegal concatenations.
> +    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
> +      if (isAscii()) {
> +        Kind = StringToks[i].getKind();
> +      } else {
> +        if (Diags)
> +          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
> +                        diag::err_unsupported_string_concat);
> +        hadError = true;
> +      }
> +    }
>   }
>
>   // Include space for the null terminator.
> @@ -890,19 +925,14 @@
>
>   // TODO: K&R warning: "traditional C rejects string constant concatenation"
>
> -  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
> -  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
> -  wchar_tByteWidth = ~0U;
> -  if (AnyWide) {
> -    wchar_tByteWidth = Target.getWCharWidth();
> -    assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
> -    wchar_tByteWidth /= 8;
> -  }
> +  // Get the width in bytes of char/wchar_t/char16_t/char32_t
> +  CharByteWidth = getCharWidth(Kind, Target);
> +  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
> +  CharByteWidth /= 8;
>
>   // The output buffer size needs to be large enough to hold wide characters.
>   // This is a worst-case assumption which basically corresponds to L"" "long".
> -  if (AnyWide)
> -    SizeBound *= wchar_tByteWidth;
> +  SizeBound *= CharByteWidth;
>
>   // Size the temporary buffer to hold the result string data.
>   ResultBuf.resize(SizeBound);
> @@ -927,18 +957,19 @@
>       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
>                          &StringInvalid);
>     if (StringInvalid) {
> -      hadError = 1;
> +      hadError = true;
>       continue;
>     }
>
>     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
> -    bool wide = false;
>     // TODO: Input character set mapping support.
>
>     // Skip the L, u, U or u8 prefix marker for wide and UTF strings.
> -    if (ThisTokBuf[0] == 'L') {
> -      wide = true;
> +    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
>       ++ThisTokBuf;
> +      // Skip 8 of u8 marker for utf8 strings.
> +      if (ThisTokBuf[0] == '8')
> +        ++ThisTokBuf;
>     }
>
>     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
> @@ -967,7 +998,7 @@
>
>         // Copy the character span over.
>         unsigned Len = ThisTokBuf-InStart;
> -        if (!AnyWide) {
> +        if (CharByteWidth == 1) {
>           memcpy(ResultPtr, InStart, Len);
>           ResultPtr += Len;
>         } else {
> @@ -975,7 +1006,7 @@
>           for (; Len; --Len, ++InStart) {
>             *ResultPtr++ = InStart[0];
>             // Add zeros at the end.
> -            for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
> +            for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
>               *ResultPtr++ = 0;
>           }
>         }
> @@ -985,29 +1016,26 @@
>       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
>         EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
>                         hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
> -                        wide, Diags, Features);
> +                        CharByteWidth, Diags, Features);
>         continue;
>       }
>       // Otherwise, this is a non-UCN escape character.  Process it.
>       unsigned ResultChar =
>         ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
>                           FullSourceLoc(StringToks[i].getLocation(), SM),
> -                          AnyWide, Diags, Target);
> +                          CharByteWidth*8, Diags);
>
>       // Note: our internal rep of wide char tokens is always little-endian.
>       *ResultPtr++ = ResultChar & 0xFF;
>
> -      if (AnyWide) {
> -        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
> -          *ResultPtr++ = ResultChar >> i*8;
> -      }
> +      for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
> +        *ResultPtr++ = ResultChar >> i*8;
>     }
>   }
>
>   if (Pascal) {
>     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
> -    if (AnyWide)
> -      ResultBuf[0] /= wchar_tByteWidth;
> +    ResultBuf[0] /= CharByteWidth;
>
>     // Verify that pascal strings aren't too large.
>     if (GetStringLength() > 256) {
> @@ -1016,7 +1044,7 @@
>                       diag::err_pascal_string_too_long)
>           << SourceRange(StringToks[0].getLocation(),
>                          StringToks[NumStringToks-1].getLocation());
> -      hadError = 1;
> +      hadError = true;
>       return;
>     }
>   } else if (Diags) {
> @@ -1050,7 +1078,8 @@
>   if (StringInvalid)
>     return 0;
>
> -  assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
> +  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
> +         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
>
>
>   const char *SpellingStart = SpellingPtr;
> @@ -1075,7 +1104,7 @@
>     bool HadError = false;
>     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
>                       FullSourceLoc(Tok.getLocation(), SM),
> -                      false, Diags, Target);
> +                      CharByteWidth*8, Diags);
>     assert(!HadError && "This method isn't valid on erroneous strings");
>     --ByteNo;
>   }
>
> Modified: cfe/trunk/lib/Lex/MacroArgs.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/MacroArgs.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/MacroArgs.cpp (original)
> +++ cfe/trunk/lib/Lex/MacroArgs.cpp Wed Jul 27 00:40:30 2011
> @@ -208,7 +208,13 @@
>     // by 6.10.3.2p2.
>     if (Tok.is(tok::string_literal) ||       // "foo"
>         Tok.is(tok::wide_string_literal) ||  // L"foo"
> -        Tok.is(tok::char_constant)) {        // 'x' and L'x'.
> +        Tok.is(tok::utf8_string_literal) ||  // u8"foo"
> +        Tok.is(tok::utf16_string_literal) || // u"foo"
> +        Tok.is(tok::utf32_string_literal) || // U"foo"
> +        Tok.is(tok::char_constant) ||        // 'x'
> +        Tok.is(tok::wide_char_constant) ||   // L'x'.
> +        Tok.is(tok::utf16_char_constant) ||  // u'x'.
> +        Tok.is(tok::utf32_char_constant)) {  // U'x'.
>       bool Invalid = false;
>       std::string TokStr = PP.getSpelling(Tok, &Invalid);
>       if (!Invalid) {
>
> Modified: cfe/trunk/lib/Lex/PPDirectives.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/PPDirectives.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/PPDirectives.cpp (original)
> +++ cfe/trunk/lib/Lex/PPDirectives.cpp Wed Jul 27 00:40:30 2011
> @@ -777,7 +777,7 @@
>   } else {
>     // Parse and validate the string, converting it into a unique ID.
>     StringLiteralParser Literal(&StrTok, 1, *this);
> -    assert(!Literal.AnyWide && "Didn't allow wide strings in");
> +    assert(Literal.isAscii() && "Didn't allow wide strings in");
>     if (Literal.hadError)
>       return DiscardUntilEndOfDirective();
>     if (Literal.Pascal) {
> @@ -910,7 +910,7 @@
>   } else {
>     // Parse and validate the string, converting it into a unique ID.
>     StringLiteralParser Literal(&StrTok, 1, *this);
> -    assert(!Literal.AnyWide && "Didn't allow wide strings in");
> +    assert(Literal.isAscii() && "Didn't allow wide strings in");
>     if (Literal.hadError)
>       return DiscardUntilEndOfDirective();
>     if (Literal.Pascal) {
>
> Modified: cfe/trunk/lib/Lex/PPExpressions.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/PPExpressions.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/PPExpressions.cpp (original)
> +++ cfe/trunk/lib/Lex/PPExpressions.cpp Wed Jul 27 00:40:30 2011
> @@ -236,7 +236,10 @@
>     PP.LexNonComment(PeekTok);
>     return false;
>   }
> -  case tok::char_constant: {   // 'x'
> +  case tok::char_constant:          // 'x'
> +  case tok::wide_char_constant: {   // L'x'
> +  case tok::utf16_char_constant:    // u'x'
> +  case tok::utf32_char_constant:    // U'x'
>     llvm::SmallString<32> CharBuffer;
>     bool CharInvalid = false;
>     StringRef ThisTok = PP.getSpelling(PeekTok, CharBuffer, &CharInvalid);
> @@ -244,7 +247,7 @@
>       return true;
>
>     CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(),
> -                              PeekTok.getLocation(), PP);
> +                              PeekTok.getLocation(), PP, PeekTok.getKind());
>     if (Literal.hadError())
>       return true;  // A diagnostic was already emitted.
>
> @@ -255,6 +258,10 @@
>       NumBits = TI.getIntWidth();
>     else if (Literal.isWide())
>       NumBits = TI.getWCharWidth();
> +    else if (Literal.isUTF16())
> +      NumBits = TI.getChar16Width();
> +    else if (Literal.isUTF32())
> +      NumBits = TI.getChar32Width();
>     else
>       NumBits = TI.getCharWidth();
>
> @@ -262,8 +269,9 @@
>     llvm::APSInt Val(NumBits);
>     // Set the value.
>     Val = Literal.getValue();
> -    // Set the signedness.
> -    Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned);
> +    // Set the signedness. UTF-16 and UTF-32 are always unsigned
> +    if (!Literal.isUTF16() && !Literal.isUTF32())
> +      Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned);
>
>     if (Result.Val.getBitWidth() > Val.getBitWidth()) {
>       Result.Val = Val.extend(Result.Val.getBitWidth());
>
> Modified: cfe/trunk/lib/Lex/Pragma.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Pragma.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/Pragma.cpp (original)
> +++ cfe/trunk/lib/Lex/Pragma.cpp Wed Jul 27 00:40:30 2011
> @@ -444,7 +444,7 @@
>
>     // Concatenate and parse the strings.
>     StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this);
> -    assert(!Literal.AnyWide && "Didn't allow wide strings in");
> +    assert(Literal.isAscii() && "Didn't allow wide strings in");
>     if (Literal.hadError)
>       return;
>     if (Literal.Pascal) {
> @@ -520,7 +520,7 @@
>
>   // Concatenate and parse the strings.
>   StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this);
> -  assert(!Literal.AnyWide && "Didn't allow wide strings in");
> +  assert(Literal.isAscii() && "Didn't allow wide strings in");
>   if (Literal.hadError)
>     return;
>   if (Literal.Pascal) {
> @@ -902,7 +902,7 @@
>
>     // Concatenate and parse the strings.
>     StringLiteralParser Literal(&StrToks[0], StrToks.size(), PP);
> -    assert(!Literal.AnyWide && "Didn't allow wide strings in");
> +    assert(Literal.isAscii() && "Didn't allow wide strings in");
>     if (Literal.hadError)
>       return;
>     if (Literal.Pascal) {
>
> Modified: cfe/trunk/lib/Lex/TokenConcatenation.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/TokenConcatenation.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/TokenConcatenation.cpp (original)
> +++ cfe/trunk/lib/Lex/TokenConcatenation.cpp Wed Jul 27 00:40:30 2011
> @@ -17,42 +17,39 @@
>  using namespace clang;
>
>
> -/// StartsWithL - Return true if the spelling of this token starts with 'L'.
> -bool TokenConcatenation::StartsWithL(const Token &Tok) const {
> -  if (!Tok.needsCleaning()) {
> -    SourceManager &SM = PP.getSourceManager();
> -    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
> -  }
> -
> -  if (Tok.getLength() < 256) {
> -    char Buffer[256];
> -    const char *TokPtr = Buffer;
> -    PP.getSpelling(Tok, TokPtr);
> -    return TokPtr[0] == 'L';
> -  }
> -
> -  return PP.getSpelling(Tok)[0] == 'L';
> -}
> +/// IsIdentifierStringPrefix - Return true if the spelling of the token
> +/// is literally 'L', 'u', 'U', or 'u8'.
> +bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
> +  const LangOptions &LangOpts = PP.getLangOptions();
>
> -/// IsIdentifierL - Return true if the spelling of this token is literally
> -/// 'L'.
> -bool TokenConcatenation::IsIdentifierL(const Token &Tok) const {
>   if (!Tok.needsCleaning()) {
> -    if (Tok.getLength() != 1)
> +    if (Tok.getLength() != 1 && Tok.getLength() != 2)
>       return false;
>     SourceManager &SM = PP.getSourceManager();
> -    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
> +    const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
> +    if (Tok.getLength() == 1)
> +      return Ptr[0] == 'L' ||
> +             (LangOpts.CPlusPlus0x && (Ptr[0] == 'u' || Ptr[0] == 'U'));
> +    if (Tok.getLength() == 2)
> +      return LangOpts.CPlusPlus0x && Ptr[0] == 'u' && Ptr[1] == '8';
>   }
>
>   if (Tok.getLength() < 256) {
>     char Buffer[256];
>     const char *TokPtr = Buffer;
> -    if (PP.getSpelling(Tok, TokPtr) != 1)
> -      return false;
> -    return TokPtr[0] == 'L';
> +    unsigned length = PP.getSpelling(Tok, TokPtr);
> +    if (length == 1)
> +      return TokPtr[0] == 'L' ||
> +             (LangOpts.CPlusPlus0x && (TokPtr[0] == 'u' || TokPtr[0] == 'U'));
> +    if (length == 2)
> +      return LangOpts.CPlusPlus0x && TokPtr[0] == 'u' && TokPtr[1] == '8';
> +    return false;
>   }
>
> -  return PP.getSpelling(Tok) == "L";
> +  std::string TokStr = PP.getSpelling(Tok);
> +  return TokStr == "L" || (LangOpts.CPlusPlus0x && (TokStr == "u8" ||
> +                                                    TokStr == "u" ||
> +                                                    TokStr == "U"));
>  }
>
>  TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
> @@ -179,24 +176,19 @@
>     if (Tok.is(tok::numeric_constant))
>       return GetFirstChar(PP, Tok) != '.';
>
> -    if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) /* ||
> -     Tok.is(tok::wide_char_literal)*/)
> +    if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) ||
> +        Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) ||
> +        Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) ||
> +        Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant))
>       return true;
>
>     // If this isn't identifier + string, we're done.
>     if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
>       return false;
>
> -    // FIXME: need a wide_char_constant!
> -
> -    // If the string was a wide string L"foo" or wide char L'f', it would
> -    // concat with the previous identifier into fooL"bar".  Avoid this.
> -    if (StartsWithL(Tok))
> -      return true;
> -
>     // Otherwise, this is a narrow character or string.  If the *identifier*
> -    // is a literal 'L', avoid pasting L "foo" -> L"foo".
> -    return IsIdentifierL(PrevTok);
> +    // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
> +    return IsIdentifierStringPrefix(PrevTok);
>   case tok::numeric_constant:
>     return isalnum(FirstChar) || Tok.is(tok::numeric_constant) ||
>     FirstChar == '+' || FirstChar == '-' || FirstChar == '.';
>
> Modified: cfe/trunk/lib/Parse/ParseCXXInlineMethods.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Parse/ParseCXXInlineMethods.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Parse/ParseCXXInlineMethods.cpp (original)
> +++ cfe/trunk/lib/Parse/ParseCXXInlineMethods.cpp Wed Jul 27 00:40:30 2011
> @@ -553,6 +553,9 @@
>
>     case tok::string_literal:
>     case tok::wide_string_literal:
> +    case tok::utf8_string_literal:
> +    case tok::utf16_string_literal:
> +    case tok::utf32_string_literal:
>       Toks.push_back(Tok);
>       ConsumeStringToken();
>       break;
>
> Modified: cfe/trunk/lib/Parse/ParseExpr.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Parse/ParseExpr.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Parse/ParseExpr.cpp (original)
> +++ cfe/trunk/lib/Parse/ParseExpr.cpp Wed Jul 27 00:40:30 2011
> @@ -769,6 +769,9 @@
>     break;
>   }
>   case tok::char_constant:     // constant: character-constant
> +  case tok::wide_char_constant:
> +  case tok::utf16_char_constant:
> +  case tok::utf32_char_constant:
>     Res = Actions.ActOnCharacterConstant(Tok);
>     ConsumeToken();
>     break;
> @@ -780,6 +783,9 @@
>     break;
>   case tok::string_literal:    // primary-expression: string-literal
>   case tok::wide_string_literal:
> +  case tok::utf8_string_literal:
> +  case tok::utf16_string_literal:
> +  case tok::utf32_string_literal:
>     Res = ParseStringLiteralExpression();
>     break;
>   case tok::kw__Generic:   // primary-expression: generic-selection [C1X 6.5.1]
>
> Modified: cfe/trunk/lib/Parse/ParseTentative.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Parse/ParseTentative.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Parse/ParseTentative.cpp (original)
> +++ cfe/trunk/lib/Parse/ParseTentative.cpp Wed Jul 27 00:40:30 2011
> @@ -605,8 +605,14 @@
>   // Obviously starts an expression.
>   case tok::numeric_constant:
>   case tok::char_constant:
> +  case tok::wide_char_constant:
> +  case tok::utf16_char_constant:
> +  case tok::utf32_char_constant:
>   case tok::string_literal:
>   case tok::wide_string_literal:
> +  case tok::utf8_string_literal:
> +  case tok::utf16_string_literal:
> +  case tok::utf32_string_literal:
>   case tok::l_square:
>   case tok::l_paren:
>   case tok::amp:
>
> Modified: cfe/trunk/lib/Parse/Parser.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Parse/Parser.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Parse/Parser.cpp (original)
> +++ cfe/trunk/lib/Parse/Parser.cpp Wed Jul 27 00:40:30 2011
> @@ -298,6 +298,9 @@
>
>     case tok::string_literal:
>     case tok::wide_string_literal:
> +    case tok::utf8_string_literal:
> +    case tok::utf16_string_literal:
> +    case tok::utf32_string_literal:
>       ConsumeStringToken();
>       break;
>
>
> Modified: cfe/trunk/lib/Rewrite/HTMLRewrite.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Rewrite/HTMLRewrite.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Rewrite/HTMLRewrite.cpp (original)
> +++ cfe/trunk/lib/Rewrite/HTMLRewrite.cpp Wed Jul 27 00:40:30 2011
> @@ -397,8 +397,15 @@
>       HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
>                      "<span class='comment'>", "</span>");
>       break;
> +    case tok::utf8_string_literal:
> +      // Chop off the u part of u8 prefix
> +      ++TokOffs;
> +      --TokLen;
> +      // FALL THROUGH to chop the 8
>     case tok::wide_string_literal:
> -      // Chop off the L prefix
> +    case tok::utf16_string_literal:
> +    case tok::utf32_string_literal:
> +      // Chop off the L, u, U or 8 prefix
>       ++TokOffs;
>       --TokLen;
>       // FALL THROUGH.
>
> Modified: cfe/trunk/lib/Rewrite/RewriteObjC.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Rewrite/RewriteObjC.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Rewrite/RewriteObjC.cpp (original)
> +++ cfe/trunk/lib/Rewrite/RewriteObjC.cpp Wed Jul 27 00:40:30 2011
> @@ -2111,8 +2111,8 @@
>   std::string StrEncoding;
>   Context->getObjCEncodingForType(Exp->getEncodedType(), StrEncoding);
>   Expr *Replacement = StringLiteral::Create(*Context, StrEncoding,
> -                                            false, false, StrType,
> -                                            SourceLocation());
> +                                            StringLiteral::Ascii, false,
> +                                            StrType, SourceLocation());
>   ReplaceStmt(Exp, Replacement);
>
>   // Replace this subexpr in the parent.
> @@ -2129,8 +2129,8 @@
>   QualType argType = Context->getPointerType(Context->CharTy);
>   SelExprs.push_back(StringLiteral::Create(*Context,
>                                            Exp->getSelector().getAsString(),
> -                                           false, false, argType,
> -                                           SourceLocation()));
> +                                           StringLiteral::Ascii, false,
> +                                           argType, SourceLocation()));
>   CallExpr *SelExp = SynthesizeCallToFunctionDecl(SelGetUidFunctionDecl,
>                                                  &SelExprs[0], SelExprs.size());
>   ReplaceStmt(Exp, SelExp);
> @@ -2797,7 +2797,8 @@
>     QualType argType = Context->getPointerType(Context->CharTy);
>     ClsExprs.push_back(StringLiteral::Create(*Context,
>                                    ClassDecl->getIdentifier()->getName(),
> -                                   false, false, argType, SourceLocation()));
> +                                   StringLiteral::Ascii, false,
> +                                   argType, SourceLocation()));
>     CallExpr *Cls = SynthesizeCallToFunctionDecl(GetMetaClassFunctionDecl,
>                                                  &ClsExprs[0],
>                                                  ClsExprs.size(),
> @@ -2875,7 +2876,7 @@
>     IdentifierInfo *clsName = Class->getIdentifier();
>     ClsExprs.push_back(StringLiteral::Create(*Context,
>                                              clsName->getName(),
> -                                             false, false,
> +                                             StringLiteral::Ascii, false,
>                                              argType, SourceLocation()));
>     CallExpr *Cls = SynthesizeCallToFunctionDecl(GetClassFunctionDecl,
>                                                  &ClsExprs[0],
> @@ -2906,7 +2907,8 @@
>     QualType argType = Context->getPointerType(Context->CharTy);
>     ClsExprs.push_back(StringLiteral::Create(*Context,
>                                    ClassDecl->getIdentifier()->getName(),
> -                                   false, false, argType, SourceLocation()));
> +                                   StringLiteral::Ascii, false, argType,
> +                                   SourceLocation()));
>     CallExpr *Cls = SynthesizeCallToFunctionDecl(GetClassFunctionDecl,
>                                                  &ClsExprs[0],
>                                                  ClsExprs.size(),
> @@ -2987,7 +2989,8 @@
>   QualType argType = Context->getPointerType(Context->CharTy);
>   SelExprs.push_back(StringLiteral::Create(*Context,
>                                        Exp->getSelector().getAsString(),
> -                                       false, false, argType, SourceLocation()));
> +                                       StringLiteral::Ascii, false,
> +                                       argType, SourceLocation()));
>   CallExpr *SelExp = SynthesizeCallToFunctionDecl(SelGetUidFunctionDecl,
>                                                  &SelExprs[0], SelExprs.size(),
>                                                   StartLoc,
>
> Modified: cfe/trunk/lib/Sema/SemaChecking.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaChecking.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaChecking.cpp Wed Jul 27 00:40:30 2011
> @@ -605,7 +605,7 @@
>   Arg = Arg->IgnoreParenCasts();
>   StringLiteral *Literal = dyn_cast<StringLiteral>(Arg);
>
> -  if (!Literal || Literal->isWide()) {
> +  if (!Literal || !Literal->isAscii()) {
>     Diag(Arg->getLocStart(), diag::err_cfstring_literal_not_string_constant)
>       << Arg->getSourceRange();
>     return true;
> @@ -1805,7 +1805,7 @@
>                              bool isPrintf) {
>
>   // CHECK: is the format string a wide literal?
> -  if (FExpr->isWide()) {
> +  if (!FExpr->isAscii()) {
>     Diag(FExpr->getLocStart(),
>          diag::warn_format_string_is_wide_literal)
>     << OrigFormatExpr->getSourceRange();
>
> Modified: cfe/trunk/lib/Sema/SemaDeclAttr.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaDeclAttr.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaDeclAttr.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaDeclAttr.cpp Wed Jul 27 00:40:30 2011
> @@ -712,7 +712,7 @@
>     Arg = Arg->IgnoreParenCasts();
>     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
>
> -    if (Str == 0 || Str->isWide()) {
> +    if (!Str || !Str->isAscii()) {
>       S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
>           << "weakref" << 1;
>       return;
> @@ -737,7 +737,7 @@
>   Arg = Arg->IgnoreParenCasts();
>   StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
>
> -  if (Str == 0 || Str->isWide()) {
> +  if (!Str || !Str->isAscii()) {
>     S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
>       << "alias" << 1;
>     return;
> @@ -1162,7 +1162,7 @@
>   Arg = Arg->IgnoreParenCasts();
>   StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
>
> -  if (Str == 0 || Str->isWide()) {
> +  if (!Str || !Str->isAscii()) {
>     S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
>       << "visibility" << 1;
>     return;
> @@ -2464,7 +2464,7 @@
>   case AttributeList::AT_pcs: {
>     Expr *Arg = Attr.getArg(0);
>     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
> -    if (Str == 0 || Str->isWide()) {
> +    if (!Str || !Str->isAscii()) {
>       S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
>         << "pcs" << 1;
>       Attr.setInvalid();
> @@ -2519,7 +2519,7 @@
>   case AttributeList::AT_pcs: {
>     Expr *Arg = attr.getArg(0);
>     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
> -    if (Str == 0 || Str->isWide()) {
> +    if (!Str || !Str->isAscii()) {
>       Diag(attr.getLoc(), diag::err_attribute_argument_n_not_string)
>         << "pcs" << 1;
>       attr.setInvalid();
> @@ -2868,7 +2868,7 @@
>
>     Expr *Arg = Attr.getArg(0);
>     StringLiteral *Str = dyn_cast<StringLiteral>(Arg);
> -    if (Str == 0 || Str->isWide()) {
> +    if (!Str || !Str->isAscii()) {
>       S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_not_string)
>         << "uuid" << 1;
>       return;
>
> Modified: cfe/trunk/lib/Sema/SemaExpr.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaExpr.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaExpr.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaExpr.cpp Wed Jul 27 00:40:30 2011
> @@ -997,11 +997,25 @@
>     StringTokLocs.push_back(StringToks[i].getLocation());
>
>   QualType StrTy = Context.CharTy;
> -  if (Literal.AnyWide)
> +  if (Literal.isWide())
>     StrTy = Context.getWCharType();
> +  else if (Literal.isUTF16())
> +    StrTy = Context.Char16Ty;
> +  else if (Literal.isUTF32())
> +    StrTy = Context.Char32Ty;
>   else if (Literal.Pascal)
>     StrTy = Context.UnsignedCharTy;
>
> +  StringLiteral::StringKind Kind = StringLiteral::Ascii;
> +  if (Literal.isWide())
> +    Kind = StringLiteral::Wide;
> +  else if (Literal.isUTF8())
> +    Kind = StringLiteral::UTF8;
> +  else if (Literal.isUTF16())
> +    Kind = StringLiteral::UTF16;
> +  else if (Literal.isUTF32())
> +    Kind = StringLiteral::UTF32;
> +
>   // A C++ string literal has a const-qualified element type (C++ 2.13.4p1).
>   if (getLangOptions().CPlusPlus || getLangOptions().ConstStrings)
>     StrTy.addConst();
> @@ -1015,7 +1029,7 @@
>
>   // Pass &StringTokLocs[0], StringTokLocs.size() to factory!
>   return Owned(StringLiteral::Create(Context, Literal.GetString(),
> -                                     Literal.AnyWide, Literal.Pascal, StrTy,
> +                                     Kind, Literal.Pascal, StrTy,
>                                      &StringTokLocs[0],
>                                      StringTokLocs.size()));
>  }
> @@ -2412,7 +2426,7 @@
>     return ExprError();
>
>   CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(), Tok.getLocation(),
> -                            PP);
> +                            PP, Tok.getKind());
>   if (Literal.hadError())
>     return ExprError();
>
> @@ -2421,14 +2435,25 @@
>     Ty = Context.IntTy;   // 'x' and L'x' -> int in C.
>   else if (Literal.isWide())
>     Ty = Context.WCharTy; // L'x' -> wchar_t in C++.
> +  else if (Literal.isUTF16())
> +    Ty = Context.Char16Ty; // u'x' -> char16_t in C++0x.
> +  else if (Literal.isUTF32())
> +    Ty = Context.Char32Ty; // U'x' -> char32_t in C++0x.
>   else if (Literal.isMultiChar())
>     Ty = Context.IntTy;   // 'wxyz' -> int in C++.
>   else
>     Ty = Context.CharTy;  // 'x' -> char in C++
>
> -  return Owned(new (Context) CharacterLiteral(Literal.getValue(),
> -                                              Literal.isWide(),
> -                                              Ty, Tok.getLocation()));
> +  CharacterLiteral::CharacterKind Kind = CharacterLiteral::Ascii;
> +  if (Literal.isWide())
> +    Kind = CharacterLiteral::Wide;
> +  else if (Literal.isUTF16())
> +    Kind = CharacterLiteral::UTF16;
> +  else if (Literal.isUTF32())
> +    Kind = CharacterLiteral::UTF32;
> +
> +  return Owned(new (Context) CharacterLiteral(Literal.getValue(), Kind, Ty,
> +                                              Tok.getLocation()));
>  }
>
>  ExprResult Sema::ActOnNumericConstant(const Token &Tok) {
> @@ -8624,7 +8649,7 @@
>
>   // Strip off any parens and casts.
>   StringLiteral *SL = dyn_cast<StringLiteral>(SrcExpr->IgnoreParenCasts());
> -  if (!SL || SL->isWide())
> +  if (!SL || !SL->isAscii())
>     return;
>
>   Hint = FixItHint::CreateInsertion(SL->getLocStart(), "@");
>
> Modified: cfe/trunk/lib/Sema/SemaExprCXX.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaExprCXX.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaExprCXX.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaExprCXX.cpp Wed Jul 27 00:40:30 2011
> @@ -2041,12 +2041,20 @@
>           = ToPtrType->getPointeeType()->getAs<BuiltinType>()) {
>         // This conversion is considered only when there is an
>         // explicit appropriate pointer target type (C++ 4.2p2).
> -        if (!ToPtrType->getPointeeType().hasQualifiers() &&
> -            ((StrLit->isWide() && ToPointeeType->isWideCharType()) ||
> -             (!StrLit->isWide() &&
> -              (ToPointeeType->getKind() == BuiltinType::Char_U ||
> -               ToPointeeType->getKind() == BuiltinType::Char_S))))
> -          return true;
> +        if (!ToPtrType->getPointeeType().hasQualifiers()) {
> +          switch (StrLit->getKind()) {
> +            case StringLiteral::UTF8:
> +            case StringLiteral::UTF16:
> +            case StringLiteral::UTF32:
> +              // We don't allow UTF literals to be implicitly converted
> +              break;
> +            case StringLiteral::Ascii:
> +              return (ToPointeeType->getKind() == BuiltinType::Char_U ||
> +                      ToPointeeType->getKind() == BuiltinType::Char_S);
> +            case StringLiteral::Wide:
> +              return ToPointeeType->isWideCharType();
> +          }
> +        }
>       }
>
>   return false;
>
> Modified: cfe/trunk/lib/Sema/SemaExprObjC.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaExprObjC.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaExprObjC.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaExprObjC.cpp Wed Jul 27 00:40:30 2011
> @@ -47,8 +47,8 @@
>     for (unsigned i = 0; i != NumStrings; ++i) {
>       S = Strings[i];
>
> -      // ObjC strings can't be wide.
> -      if (S->isWide()) {
> +      // ObjC strings can't be wide or UTF.
> +      if (!S->isAscii()) {
>         Diag(S->getLocStart(), diag::err_cfstring_literal_not_string_constant)
>           << S->getSourceRange();
>         return true;
> @@ -64,7 +64,7 @@
>     // Create the aggregate string with the appropriate content and location
>     // information.
>     S = StringLiteral::Create(Context, StrBuf,
> -                              /*Wide=*/false, /*Pascal=*/false,
> +                              StringLiteral::Ascii, /*Pascal=*/false,
>                               Context.getPointerType(Context.CharTy),
>                               &StrLocs[0], StrLocs.size());
>   }
>
> Modified: cfe/trunk/lib/Sema/SemaInit.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaInit.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaInit.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaInit.cpp Wed Jul 27 00:40:30 2011
> @@ -49,20 +49,30 @@
>   if (SL == 0) return 0;
>
>   QualType ElemTy = Context.getCanonicalType(AT->getElementType());
> -  // char array can be initialized with a narrow string.
> -  // Only allow char x[] = "foo";  not char x[] = L"foo";
> -  if (!SL->isWide())
> +
> +  switch (SL->getKind()) {
> +  case StringLiteral::Ascii:
> +  case StringLiteral::UTF8:
> +    // char array can be initialized with a narrow string.
> +    // Only allow char x[] = "foo";  not char x[] = L"foo";
>     return ElemTy->isCharType() ? Init : 0;
> +  case StringLiteral::UTF16:
> +    return ElemTy->isChar16Type() ? Init : 0;
> +  case StringLiteral::UTF32:
> +    return ElemTy->isChar32Type() ? Init : 0;
> +  case StringLiteral::Wide:
> +    // wchar_t array can be initialized with a wide string: C99 6.7.8p15 (with
> +    // correction from DR343): "An array with element type compatible with a
> +    // qualified or unqualified version of wchar_t may be initialized by a wide
> +    // string literal, optionally enclosed in braces."
> +    if (Context.typesAreCompatible(Context.getWCharType(),
> +                                   ElemTy.getUnqualifiedType()))
> +      return Init;
>
> -  // wchar_t array can be initialized with a wide string: C99 6.7.8p15 (with
> -  // correction from DR343): "An array with element type compatible with a
> -  // qualified or unqualified version of wchar_t may be initialized by a wide
> -  // string literal, optionally enclosed in braces."
> -  if (Context.typesAreCompatible(Context.getWCharType(),
> -                                 ElemTy.getUnqualifiedType()))
> -    return Init;
> +    return 0;
> +  }
>
> -  return 0;
> +  llvm_unreachable("missed a StringLiteral kind?");
>  }
>
>  static Expr *IsStringInit(Expr *init, QualType declType, ASTContext &Context) {
>
> Modified: cfe/trunk/lib/Sema/SemaStmt.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaStmt.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaStmt.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaStmt.cpp Wed Jul 27 00:40:30 2011
> @@ -1952,13 +1952,13 @@
>   SmallVector<TargetInfo::ConstraintInfo, 4> OutputConstraintInfos;
>
>   // The parser verifies that there is a string literal here.
> -  if (AsmString->isWide())
> +  if (!AsmString->isAscii())
>     return StmtError(Diag(AsmString->getLocStart(),diag::err_asm_wide_character)
>       << AsmString->getSourceRange());
>
>   for (unsigned i = 0; i != NumOutputs; i++) {
>     StringLiteral *Literal = Constraints[i];
> -    if (Literal->isWide())
> +    if (!Literal->isAscii())
>       return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character)
>         << Literal->getSourceRange());
>
> @@ -1987,7 +1987,7 @@
>
>   for (unsigned i = NumOutputs, e = NumOutputs + NumInputs; i != e; i++) {
>     StringLiteral *Literal = Constraints[i];
> -    if (Literal->isWide())
> +    if (!Literal->isAscii())
>       return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character)
>         << Literal->getSourceRange());
>
> @@ -2034,7 +2034,7 @@
>   // Check that the clobbers are valid.
>   for (unsigned i = 0; i != NumClobbers; i++) {
>     StringLiteral *Literal = Clobbers[i];
> -    if (Literal->isWide())
> +    if (!Literal->isAscii())
>       return StmtError(Diag(Literal->getLocStart(),diag::err_asm_wide_character)
>         << Literal->getSourceRange());
>
>
> Modified: cfe/trunk/lib/Sema/SemaTemplate.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaTemplate.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Sema/SemaTemplate.cpp (original)
> +++ cfe/trunk/lib/Sema/SemaTemplate.cpp Wed Jul 27 00:40:30 2011
> @@ -4131,10 +4131,22 @@
>   assert(Arg.getKind() == TemplateArgument::Integral &&
>          "Operation is only valid for integral template arguments");
>   QualType T = Arg.getIntegralType();
> -  if (T->isCharType() || T->isWideCharType())
> +  if (T->isAnyCharacterType()) {
> +    CharacterLiteral::CharacterKind Kind;
> +    if (T->isWideCharType())
> +      Kind = CharacterLiteral::Wide;
> +    else if (T->isChar16Type())
> +      Kind = CharacterLiteral::UTF16;
> +    else if (T->isChar32Type())
> +      Kind = CharacterLiteral::UTF32;
> +    else
> +      Kind = CharacterLiteral::Ascii;
> +
>     return Owned(new (Context) CharacterLiteral(
> -                                             Arg.getAsIntegral()->getZExtValue(),
> -                                             T->isWideCharType(), T, Loc));
> +                                            Arg.getAsIntegral()->getZExtValue(),
> +                                            Kind, T, Loc));
> +  }
> +
>   if (T->isBooleanType())
>     return Owned(new (Context) CXXBoolLiteralExpr(
>                                             Arg.getAsIntegral()->getBoolValue(),
>
> Modified: cfe/trunk/lib/Serialization/ASTReaderStmt.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Serialization/ASTReaderStmt.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Serialization/ASTReaderStmt.cpp (original)
> +++ cfe/trunk/lib/Serialization/ASTReaderStmt.cpp Wed Jul 27 00:40:30 2011
> @@ -371,7 +371,7 @@
>   assert(Record[Idx] == E->getNumConcatenated() &&
>          "Wrong number of concatenated tokens!");
>   ++Idx;
> -  E->IsWide = Record[Idx++];
> +  E->Kind = static_cast<StringLiteral::StringKind>(Record[Idx++]);
>   E->IsPascal = Record[Idx++];
>
>   // Read string data
> @@ -388,7 +388,7 @@
>   VisitExpr(E);
>   E->setValue(Record[Idx++]);
>   E->setLocation(ReadSourceLocation(Record, Idx));
> -  E->setWide(Record[Idx++]);
> +  E->setKind(static_cast<CharacterLiteral::CharacterKind>(Record[Idx++]));
>  }
>
>  void ASTStmtReader::VisitParenExpr(ParenExpr *E) {
>
> Modified: cfe/trunk/lib/Serialization/ASTWriterStmt.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Serialization/ASTWriterStmt.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Serialization/ASTWriterStmt.cpp (original)
> +++ cfe/trunk/lib/Serialization/ASTWriterStmt.cpp Wed Jul 27 00:40:30 2011
> @@ -324,7 +324,7 @@
>   VisitExpr(E);
>   Record.push_back(E->getByteLength());
>   Record.push_back(E->getNumConcatenated());
> -  Record.push_back(E->isWide());
> +  Record.push_back(E->getKind());
>   Record.push_back(E->isPascal());
>   // FIXME: String data should be stored as a blob at the end of the
>   // StringLiteral. However, we can't do so now because we have no
> @@ -340,7 +340,7 @@
>   VisitExpr(E);
>   Record.push_back(E->getValue());
>   Writer.AddSourceLocation(E->getLocation(), Record);
> -  Record.push_back(E->isWide());
> +  Record.push_back(E->getKind());
>
>   AbbrevToUse = Writer.getCharacterLiteralAbbrev();
>
>
> Modified: cfe/trunk/test/CXX/lex/lex.literal/lex.ccon/p1.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CXX/lex/lex.literal/lex.ccon/p1.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/test/CXX/lex/lex.literal/lex.ccon/p1.cpp (original)
> +++ cfe/trunk/test/CXX/lex/lex.literal/lex.ccon/p1.cpp Wed Jul 27 00:40:30 2011
> @@ -1,4 +1,5 @@
> -// RUN: %clang_cc1 -fsyntax-only -verify %s
> +// RUN: %clang_cc1 -std=c++0x -fsyntax-only -verify %s
> +// Runs in c++0x mode so that char16_t and char32_t are available.
>
>  // Check types of char literals
>  extern char a;
> @@ -7,3 +8,7 @@
>  extern __typeof('asdf') b;
>  extern wchar_t c;
>  extern __typeof(L'a') c;
> +extern char16_t d;
> +extern __typeof(u'a') d;
> +extern char32_t e;
> +extern __typeof(U'a') e;
>
> Modified: cfe/trunk/test/CodeGen/char-literal.c
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/char-literal.c?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/test/CodeGen/char-literal.c (original)
> +++ cfe/trunk/test/CodeGen/char-literal.c Wed Jul 27 00:40:30 2011
> @@ -1,5 +1,5 @@
> -// RUN: %clang_cc1 -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s
> -// Runs in c++ mode so that wchar_t is available.
> +// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s
> +// Runs in c++0x mode so that wchar_t, char16_t, and char32_t are available.
>
>  int main() {
>   // CHECK: store i8 97
> @@ -16,6 +16,20 @@
>   // CHECK: store i32 98
>   wchar_t wb = L'ab';
>
> +  // CHECK: store i16 97
> +  char16_t ua = u'a';
> +
> +  // Should pick second character.
> +  // CHECK: store i16 98
> +  char16_t ub = u'ab';
> +
> +  // CHECK: store i32 97
> +  char32_t Ua = U'a';
> +
> +  // Should pick second character.
> +  // CHECK: store i32 98
> +  char32_t Ub = U'ab';
> +
>   // Should pick last character and store its lowest byte.
>   // This does not match gcc, which takes the last character, converts it to
>   // utf8, and then picks the second-lowest byte of that (they probably store
> @@ -26,10 +40,36 @@
>   // CHECK: store i32 61451
>   wchar_t wc = L'\uF00B';
>
> +  // -4085 == 0xf00b
> +  // CHECK: store i16 -4085
> +  char16_t uc = u'\uF00B';
> +
> +  // CHECK: store i32 61451
> +  char32_t Uc = U'\uF00B';
> +
>   // CHECK: store i32 1110027
>   wchar_t wd = L'\U0010F00B';
>
> +  // Should take lower word of the 4byte UCN sequence. This does not match
> +  // gcc. I don't understand what gcc does (it looks like it converts to utf16,
> +  // then takes the second (!) utf16 word, swaps the lower two nibbles, and
> +  // stores that?).
> +  // CHECK: store i16 -4085
> +  char16_t ud = u'\U0010F00B';  // has utf16 encoding dbc8 dcb0
> +
> +  // CHECK: store i32 1110027
> +  char32_t Ud = U'\U0010F00B';
> +
>   // Should pick second character.
>   // CHECK: store i32 1110027
>   wchar_t we = L'\u1234\U0010F00B';
> +
> +  // Should pick second character.
> +  // CHECK: store i16 -4085
> +  char16_t ue = u'\u1234\U0010F00B';
> +
> +  // Should pick second character.
> +  // CHECK: store i32 1110027
> +  char32_t Ue = U'\u1234\U0010F00B';
> +
>  }
>
> Modified: cfe/trunk/test/CodeGen/string-literal.c
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/string-literal.c?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/test/CodeGen/string-literal.c (original)
> +++ cfe/trunk/test/CodeGen/string-literal.c Wed Jul 27 00:40:30 2011
> @@ -1,4 +1,5 @@
> -// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s
> +// RUN: %clang_cc1 -x c++ -std=c++0x -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s
> +// Runs in c++0x mode so that wchar_t, char16_t, and char32_t are available.
>
>  int main() {
>   // CHECK: internal unnamed_addr constant [10 x i8] c"abc\00\00\00\00\00\00\00", align 1
> @@ -9,8 +10,24 @@
>   char b[10] = "\u1120\u0220\U00102030";
>
>   // CHECK: private unnamed_addr constant [12 x i8] c"A\00\00\00B\00\00\00\00\00\00\00", align 1
> -  void *foo = L"AB";
> +  const wchar_t *foo = L"AB";
>
>   // CHECK: private unnamed_addr constant [12 x i8] c"4\12\00\00\0B\F0\10\00\00\00\00\00", align 1
> -  void *bar = L"\u1234\U0010F00B";
> +  const wchar_t *bar = L"\u1234\U0010F00B";
> +
> +  // CHECK: private unnamed_addr constant [12 x i8] c"C\00\00\00D\00\00\00\00\00\00\00", align 1
> +  const char32_t *c = U"CD";
> +
> +  // CHECK: private unnamed_addr constant [12 x i8] c"5\12\00\00\0C\F0\10\00\00\00\00\00", align 1
> +  const char32_t *d = U"\u1235\U0010F00C";
> +
> +  // CHECK: private unnamed_addr constant [6 x i8] c"E\00F\00\00\00", align 1
> +  const char16_t *e = u"EF";
> +
> +  // This should convert to utf16.
> +  // CHECK: private unnamed_addr constant [10 x i8] c" \11 \02\C8\DB0\DC\00\00", align 1
> +  const char16_t *f = u"\u1120\u0220\U00102030";
> +
> +  // CHECK: private unnamed_addr constant [4 x i8] c"def\00", align 1
> +  const char *g = u8"def";
>  }
>
> Modified: cfe/trunk/test/Lexer/wchar.c
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Lexer/wchar.c?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/test/Lexer/wchar.c (original)
> +++ cfe/trunk/test/Lexer/wchar.c Wed Jul 27 00:40:30 2011
> @@ -5,8 +5,8 @@
>
>   (void)L'\U00010000';  // expected-warning {{character unicode escape sequence too long for its type}}
>
> -  (void)L'ab';  // expected-warning {{extraneous characters in wide character constant ignored}}
> +  (void)L'ab';  // expected-warning {{extraneous characters in character constant ignored}}
>
> -  (void)L'a\u1000';  // expected-warning {{extraneous characters in wide character constant ignored}}
> +  (void)L'a\u1000';  // expected-warning {{extraneous characters in character constant ignored}}
>  }
>
>
> Modified: cfe/trunk/test/Parser/char-literal-printing.c
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Parser/char-literal-printing.c?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/test/Parser/char-literal-printing.c (original)
> +++ cfe/trunk/test/Parser/char-literal-printing.c Wed Jul 27 00:40:30 2011
> @@ -1,6 +1,5 @@
> -// RUN: %clang_cc1 -ast-print %s
> -
> -#include <stddef.h>
> +// RUN: %clang_cc1 -x c++ -std=c++0x -ast-print %s
> +// Runs in c++0x mode so that wchar_t, char16_t, and char32_t are available.
>
>  char    test1(void) { return '\\'; }
>  wchar_t test2(void) { return L'\\'; }
> @@ -29,3 +28,35 @@
>  wchar_t test24(void) { return L'\x3'; }
>
>  wchar_t test25(void) { return L'\x333'; }
> +
> +char16_t test26(void) { return u'\\'; }
> +char16_t test27(void) { return u'\''; }
> +char16_t test28(void) { return u'\a'; }
> +char16_t test29(void) { return u'\b'; }
> +char16_t test30(void) { return u'\e'; }
> +char16_t test31(void) { return u'\f'; }
> +char16_t test32(void) { return u'\n'; }
> +char16_t test33(void) { return u'\r'; }
> +char16_t test34(void) { return u'\t'; }
> +char16_t test35(void) { return u'\v'; }
> +
> +char16_t test36(void) { return u'c'; }
> +char16_t test37(void) { return u'\x3'; }
> +
> +char16_t test38(void) { return u'\x333'; }
> +
> +char32_t test39(void) { return U'\\'; }
> +char32_t test40(void) { return U'\''; }
> +char32_t test41(void) { return U'\a'; }
> +char32_t test42(void) { return U'\b'; }
> +char32_t test43(void) { return U'\e'; }
> +char32_t test44(void) { return U'\f'; }
> +char32_t test45(void) { return U'\n'; }
> +char32_t test46(void) { return U'\r'; }
> +char32_t test47(void) { return U'\t'; }
> +char32_t test48(void) { return U'\v'; }
> +
> +char32_t test49(void) { return U'c'; }
> +char32_t test50(void) { return U'\x3'; }
> +
> +char32_t test51(void) { return U'\x333'; }
>
> Modified: cfe/trunk/test/SemaCXX/type-convert-construct.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCXX/type-convert-construct.cpp?rev=136210&r1=136209&r2=136210&view=diff
> ==============================================================================
> --- cfe/trunk/test/SemaCXX/type-convert-construct.cpp (original)
> +++ cfe/trunk/test/SemaCXX/type-convert-construct.cpp Wed Jul 27 00:40:30 2011
> @@ -1,4 +1,5 @@
> -// RUN: %clang_cc1 -fsyntax-only -verify %s
> +// RUN: %clang_cc1 -std=gnu++0x -fsyntax-only -verify %s
> +// Runs in c++0x mode so that char16_t and char32_t are available.
>
>  void f() {
>   float v1 = float(1);
> @@ -14,4 +15,8 @@
>   str = "a string"; // expected-warning{{conversion from string literal to 'char *' is deprecated}}
>   wchar_t *wstr;
>   wstr = L"a wide string"; // expected-warning{{conversion from string literal to 'wchar_t *' is deprecated}}
> +  char16_t *ustr;
> +  ustr = u"a UTF-16 string"; // expected-error {{assigning to 'char16_t *' from incompatible type 'const char16_t [16]'}}
> +  char32_t *Ustr;
> +  Ustr = U"a UTF-32 string"; // expected-error {{assigning to 'char32_t *' from incompatible type 'const char32_t [16]'}}
>  }

Are there any tests left for wide characters in non-c++0x mode? Should there be?

Nico