r201532 - PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes.
Richard Smith
richard-llvm at metafoo.co.uk
Mon Feb 17 13:52:31 PST 2014
Author: rsmith
Date: Mon Feb 17 15:52:30 2014
New Revision: 201532
URL: http://llvm.org/viewvc/llvm-project?rev=201532&view=rev
Log:
PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes.
Modified:
cfe/trunk/include/clang/Lex/Lexer.h
cfe/trunk/include/clang/Lex/LiteralSupport.h
cfe/trunk/lib/Lex/Lexer.cpp
cfe/trunk/lib/Lex/LiteralSupport.cpp
cfe/trunk/lib/Lex/Preprocessor.cpp
cfe/trunk/test/Parser/cxx11-user-defined-literals.cpp
Modified: cfe/trunk/include/clang/Lex/Lexer.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/Lexer.h?rev=201532&r1=201531&r2=201532&view=diff
==============================================================================
--- cfe/trunk/include/clang/Lex/Lexer.h (original)
+++ cfe/trunk/include/clang/Lex/Lexer.h Mon Feb 17 15:52:30 2014
@@ -614,8 +614,28 @@ private:
/// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
/// invalid.
uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
-};
+ /// \brief Try to consume a UCN as part of an identifier at the current
+ /// location.
+ /// \param CurPtr Initially points to the range of characters in the source
+ /// buffer containing the '\'. Updated to point past the end of
+ /// the UCN on success.
+ /// \param Size The number of characters occupied by the '\' (including
+ /// trigraphs and escaped newlines).
+ /// \param Result The token being produced. Marked as containing a UCN on
+ /// success.
+ /// \return \c true if a UCN was lexed and it produced an acceptable
+ /// identifier character, \c false otherwise.
+ bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
+ Token &Result);
+
+ /// \brief Try to consume an identifier character encoded in UTF-8.
+ /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
+ /// sequence. On success, updated to point past the end of it.
+ /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
+ /// character was lexed, \c false otherwise.
+ bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
+};
} // end namespace clang
Modified: cfe/trunk/include/clang/Lex/LiteralSupport.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/LiteralSupport.h?rev=201532&r1=201531&r2=201532&view=diff
==============================================================================
--- cfe/trunk/include/clang/Lex/LiteralSupport.h (original)
+++ cfe/trunk/include/clang/Lex/LiteralSupport.h Mon Feb 17 15:52:30 2014
@@ -33,6 +33,9 @@ class TargetInfo;
class SourceManager;
class LangOptions;
+/// Copy characters from Input to Buf, expanding any UCNs.
+void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input);
+
/// NumericLiteralParser - This performs strict semantic analysis of the content
/// of a ppnumber, classifying it as either integer, floating, or erroneous,
/// determines the radix of the value and can convert it to a useful value.
@@ -48,6 +51,8 @@ class NumericLiteralParser {
bool saw_exponent, saw_period, saw_ud_suffix;
+ SmallString<32> UDSuffixBuf;
+
public:
NumericLiteralParser(StringRef TokSpelling,
SourceLocation TokLoc,
@@ -72,7 +77,7 @@ public:
}
StringRef getUDSuffix() const {
assert(saw_ud_suffix);
- return StringRef(SuffixBegin, ThisTokEnd - SuffixBegin);
+ return UDSuffixBuf;
}
unsigned getUDSuffixOffset() const {
assert(saw_ud_suffix);
Modified: cfe/trunk/lib/Lex/Lexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Lexer.cpp?rev=201532&r1=201531&r2=201532&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Lexer.cpp (original)
+++ cfe/trunk/lib/Lex/Lexer.cpp Mon Feb 17 15:52:30 2014
@@ -1445,7 +1445,50 @@ static void maybeDiagnoseIDCharCompat(Di
<< Range;
}
}
- }
+}
+
+bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
+ Token &Result) {
+ const char *UCNPtr = CurPtr + Size;
+ uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
+ if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
+ return false;
+
+ if (!isLexingRawMode())
+ maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, CurPtr, UCNPtr),
+ /*IsFirst=*/false);
+
+ Result.setFlag(Token::HasUCN);
+ if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
+ (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
+ CurPtr = UCNPtr;
+ else
+ while (CurPtr != UCNPtr)
+ (void)getAndAdvanceChar(CurPtr, Result);
+ return true;
+}
+
+bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
+ const char *UnicodePtr = CurPtr;
+ UTF32 CodePoint;
+ ConversionResult Result =
+ llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
+ (const UTF8 *)BufferEnd,
+ &CodePoint,
+ strictConversion);
+ if (Result != conversionOK ||
+ !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
+ return false;
+
+ if (!isLexingRawMode())
+ maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, CurPtr, UnicodePtr),
+ /*IsFirst=*/false);
+
+ CurPtr = UnicodePtr;
+ return true;
+}
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
// Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@@ -1500,47 +1543,10 @@ FinishIdentifier:
C = getCharAndSize(CurPtr, Size);
continue;
- } else if (C == '\\') {
- const char *UCNPtr = CurPtr + Size;
- uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
- if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
- goto FinishIdentifier;
-
- if (!isLexingRawMode()) {
- maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
- makeCharRange(*this, CurPtr, UCNPtr),
- /*IsFirst=*/false);
- }
-
- Result.setFlag(Token::HasUCN);
- if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
- (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
- CurPtr = UCNPtr;
- else
- while (CurPtr != UCNPtr)
- (void)getAndAdvanceChar(CurPtr, Result);
-
+ } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
C = getCharAndSize(CurPtr, Size);
continue;
- } else if (!isASCII(C)) {
- const char *UnicodePtr = CurPtr;
- UTF32 CodePoint;
- ConversionResult Result =
- llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
- (const UTF8 *)BufferEnd,
- &CodePoint,
- strictConversion);
- if (Result != conversionOK ||
- !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
- goto FinishIdentifier;
-
- if (!isLexingRawMode()) {
- maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
- makeCharRange(*this, CurPtr, UnicodePtr),
- /*IsFirst=*/false);
- }
-
- CurPtr = UnicodePtr;
+ } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
C = getCharAndSize(CurPtr, Size);
continue;
} else if (!isIdentifierBody(C)) {
@@ -1576,7 +1582,7 @@ bool Lexer::LexNumericConstant(Token &Re
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
char PrevCh = 0;
- while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+ while (isPreprocessingNumberBody(C)) {
CurPtr = ConsumeChar(CurPtr, Size, Result);
PrevCh = C;
C = getCharAndSize(CurPtr, Size);
@@ -1618,6 +1624,12 @@ bool Lexer::LexNumericConstant(Token &Re
}
}
+ // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
+ if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
+ return LexNumericConstant(Result, CurPtr);
+ if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+ return LexNumericConstant(Result, CurPtr);
+
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
@@ -1631,23 +1643,35 @@ const char *Lexer::LexUDSuffix(Token &Re
bool IsStringLiteral) {
assert(getLangOpts().CPlusPlus);
- // Maximally munch an identifier. FIXME: UCNs.
+ // Maximally munch an identifier.
unsigned Size;
char C = getCharAndSize(CurPtr, Size);
- if (isIdentifierHead(C)) {
- if (!getLangOpts().CPlusPlus11) {
- if (!isLexingRawMode())
- Diag(CurPtr,
- C == '_' ? diag::warn_cxx11_compat_user_defined_literal
- : diag::warn_cxx11_compat_reserved_user_defined_literal)
- << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+ bool Consumed = false;
+
+ if (!isIdentifierHead(C)) {
+ if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
+ Consumed = true;
+ else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+ Consumed = true;
+ else
return CurPtr;
- }
+ }
+
+ if (!getLangOpts().CPlusPlus11) {
+ if (!isLexingRawMode())
+ Diag(CurPtr,
+ C == '_' ? diag::warn_cxx11_compat_user_defined_literal
+ : diag::warn_cxx11_compat_reserved_user_defined_literal)
+ << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+ return CurPtr;
+ }
- // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
- // that does not start with an underscore is ill-formed. As a conforming
- // extension, we treat all such suffixes as if they had whitespace before
- // them.
+ // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
+ // that does not start with an underscore is ill-formed. As a conforming
+ // extension, we treat all such suffixes as if they had whitespace before
+ // them. We assume a suffix beginning with a UCN or UTF-8 character is more
+ // likely to be a ud-suffix than a macro, however, and accept that.
+ if (!Consumed) {
bool IsUDSuffix = false;
if (C == '_')
IsUDSuffix = true;
@@ -1685,16 +1709,22 @@ const char *Lexer::LexUDSuffix(Token &Re
Diag(CurPtr, getLangOpts().MSVCCompat
? diag::ext_ms_reserved_user_defined_literal
: diag::ext_reserved_user_defined_literal)
- << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+ << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
return CurPtr;
}
- Result.setFlag(Token::HasUDSuffix);
- do {
- CurPtr = ConsumeChar(CurPtr, Size, Result);
- C = getCharAndSize(CurPtr, Size);
- } while (isIdentifierBody(C));
+ CurPtr = ConsumeChar(CurPtr, Size, Result);
}
+
+ Result.setFlag(Token::HasUDSuffix);
+ while (true) {
+ C = getCharAndSize(CurPtr, Size);
+ if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
+ else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
+ else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
+ else break;
+ }
+
return CurPtr;
}
Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=201532&r1=201531&r2=201532&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
+++ cfe/trunk/lib/Lex/LiteralSupport.cpp Mon Feb 17 15:52:30 2014
@@ -212,6 +212,48 @@ static unsigned ProcessCharEscape(const
return ResultChar;
}
+static void appendCodePoint(unsigned Codepoint,
+ llvm::SmallVectorImpl<char> &Str) {
+ char ResultBuf[4];
+ char *ResultPtr = ResultBuf;
+ bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
+ (void)Res;
+ assert(Res && "Unexpected conversion failure");
+ Str.append(ResultBuf, ResultPtr);
+}
+
+void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
+ for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
+ if (*I != '\\') {
+ Buf.push_back(*I);
+ continue;
+ }
+
+ ++I;
+ assert(*I == 'u' || *I == 'U');
+
+ unsigned NumHexDigits;
+ if (*I == 'u')
+ NumHexDigits = 4;
+ else
+ NumHexDigits = 8;
+
+ assert(I + NumHexDigits <= E);
+
+ uint32_t CodePoint = 0;
+ for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
+ unsigned Value = llvm::hexDigitValue(*I);
+ assert(Value != -1U);
+
+ CodePoint <<= 4;
+ CodePoint += Value;
+ }
+
+ appendCodePoint(CodePoint, Buf);
+ --I;
+ }
+}
+
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
/// return the UTF32.
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
@@ -625,8 +667,9 @@ NumericLiteralParser::NumericLiteralPars
}
if (s != ThisTokEnd) {
- if (isValidUDSuffix(PP.getLangOpts(),
- StringRef(SuffixBegin, ThisTokEnd - SuffixBegin))) {
+ // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
+ expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
+ if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
// Any suffix pieces we might have parsed are actually part of the
// ud-suffix.
isLong = false;
@@ -992,7 +1035,8 @@ CharLiteralParser::CharLiteralParser(con
do {
--end;
} while (end[-1] != '\'');
- UDSuffixBuf.assign(end, UDSuffixEnd);
+ // FIXME: Don't bother with this if !tok.hasUCN().
+ expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
UDSuffixOffset = end - TokBegin;
}
@@ -1311,23 +1355,34 @@ void StringLiteralParser::init(const Tok
StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
if (UDSuffixBuf.empty()) {
- UDSuffixBuf.assign(UDSuffix);
+ if (StringToks[i].hasUCN())
+ expandUCNs(UDSuffixBuf, UDSuffix);
+ else
+ UDSuffixBuf.assign(UDSuffix);
UDSuffixToken = i;
UDSuffixOffset = ThisTokEnd - ThisTokBuf;
UDSuffixTokLoc = StringToks[i].getLocation();
- } else if (!UDSuffixBuf.equals(UDSuffix)) {
+ } else {
+ SmallString<32> ExpandedUDSuffix;
+ if (StringToks[i].hasUCN()) {
+ expandUCNs(ExpandedUDSuffix, UDSuffix);
+ UDSuffix = ExpandedUDSuffix;
+ }
+
// C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
// result of a concatenation involving at least one user-defined-string-
// literal, all the participating user-defined-string-literals shall
// have the same ud-suffix.
- if (Diags) {
- SourceLocation TokLoc = StringToks[i].getLocation();
- Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
- << UDSuffixBuf << UDSuffix
- << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
- << SourceRange(TokLoc, TokLoc);
+ if (!UDSuffixBuf.equals(UDSuffix)) {
+ if (Diags) {
+ SourceLocation TokLoc = StringToks[i].getLocation();
+ Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
+ << UDSuffixBuf << UDSuffix
+ << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
+ << SourceRange(TokLoc, TokLoc);
+ }
+ hadError = true;
}
- hadError = true;
}
}
Modified: cfe/trunk/lib/Lex/Preprocessor.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/Preprocessor.cpp?rev=201532&r1=201531&r2=201532&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/Preprocessor.cpp (original)
+++ cfe/trunk/lib/Lex/Preprocessor.cpp Mon Feb 17 15:52:30 2014
@@ -503,48 +503,6 @@ void Preprocessor::EndSourceFile() {
// Lexer Event Handling.
//===----------------------------------------------------------------------===//
-static void appendCodePoint(unsigned Codepoint,
- llvm::SmallVectorImpl<char> &Str) {
- char ResultBuf[4];
- char *ResultPtr = ResultBuf;
- bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
- (void)Res;
- assert(Res && "Unexpected conversion failure");
- Str.append(ResultBuf, ResultPtr);
-}
-
-static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
- for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
- if (*I != '\\') {
- Buf.push_back(*I);
- continue;
- }
-
- ++I;
- assert(*I == 'u' || *I == 'U');
-
- unsigned NumHexDigits;
- if (*I == 'u')
- NumHexDigits = 4;
- else
- NumHexDigits = 8;
-
- assert(I + NumHexDigits <= E);
-
- uint32_t CodePoint = 0;
- for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
- unsigned Value = llvm::hexDigitValue(*I);
- assert(Value != -1U);
-
- CodePoint <<= 4;
- CodePoint += Value;
- }
-
- appendCodePoint(CodePoint, Buf);
- --I;
- }
-}
-
/// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
/// identifier information for the token and install it into the token,
/// updating the token kind accordingly.
Modified: cfe/trunk/test/Parser/cxx11-user-defined-literals.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Parser/cxx11-user-defined-literals.cpp?rev=201532&r1=201531&r2=201532&view=diff
==============================================================================
--- cfe/trunk/test/Parser/cxx11-user-defined-literals.cpp (original)
+++ cfe/trunk/test/Parser/cxx11-user-defined-literals.cpp Mon Feb 17 15:52:30 2014
@@ -111,3 +111,35 @@ void operator "" ""
U"" // expected-error {{cannot have an encoding prefix}}
"" _also_not_char(const char *);
void operator "" u8"" "\u0123" "hello"_all_of_the_things ""(const char*); // expected-error {{must be '""'}}
+
+// Make sure we treat UCNs and UTF-8 as equivalent.
+int operator""_µs(unsigned long long) {} // expected-note {{previous}}
+int hundred_µs = 50_µs + 50_\u00b5s;
+int operator""_\u00b5s(unsigned long long) {} // expected-error {{redefinition of 'operator "" _µs'}}
+
+int operator""_\U0000212B(long double) {} // expected-note {{previous}}
+int hundred_Å = 50.0_Å + 50._\U0000212B;
+int operator""_Å(long double) {} // expected-error {{redefinition of 'operator "" _Å'}}
+
+int operator""_𐀀(char) {} // expected-note {{previous}}
+int 𐀀 = '4'_𐀀 + '2'_\U00010000;
+int operator""_\U00010000(char) {} // expected-error {{redefinition of 'operator "" _𐀀'}}
+
+// These all declare the same function.
+int operator""_℮""_\u212e""_\U0000212e""(const char*, size_t);
+int operator""_\u212e""_\U0000212e""_℮""(const char*, size_t);
+int operator""_\U0000212e""_℮""_\u212e""(const char*, size_t);
+int mix_ucn_utf8 = ""_℮""_\u212e""_\U0000212e"";
+
+void operator""_℮""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+void operator""_℮""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+void operator""_\u212e""_ℯ(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+void operator""_\u212e""_\u212f(unsigned long long) {} // expected-error {{differing user-defined suffixes ('_℮' and '_ℯ') in string literal concatenation}}
+
+void operator""_℮""_℮(unsigned long long) {} // expected-note {{previous}}
+void operator""_\u212e""_\u212e(unsigned long long) {} // expected-error {{redefinition}}
+
+#define ¢ *0.01 // expected-error {{macro names must be identifiers}}
+constexpr int operator""_¢(long double d) { return d * 100; } // expected-error {{non-ASCII}}
+constexpr int operator""_¢(unsigned long long n) { return n; } // expected-error {{non-ASCII}}
+static_assert(0.02_¢ == 2_¢, ""); // expected-error 2{{non-ASCII}}
More information about the cfe-commits
mailing list