[clang] 601102d - Cleanup identifier parsing; NFC
Aaron Ballman via cfe-commits
cfe-commits at lists.llvm.org
Tue Sep 14 06:12:29 PDT 2021
Author: Corentin Jabot
Date: 2021-09-14T09:12:22-04:00
New Revision: 601102d282d5e9a1429fea52ee17303aec8a7c10
URL: https://github.com/llvm/llvm-project/commit/601102d282d5e9a1429fea52ee17303aec8a7c10
DIFF: https://github.com/llvm/llvm-project/commit/601102d282d5e9a1429fea52ee17303aec8a7c10.diff
LOG: Cleanup identifier parsing; NFC
Rename methods to clearly signal when they only deal with ASCII,
simplify the parsing of identifiers, and use start/continue instead of
head/body for consistency with Unicode terminology.
Added:
Modified:
clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp
clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
clang-tools-extra/clangd/CodeComplete.cpp
clang-tools-extra/clangd/SourceCode.cpp
clang-tools-extra/clangd/refactor/Rename.cpp
clang/include/clang/Basic/CharInfo.h
clang/include/clang/Lex/Lexer.h
clang/lib/ARCMigrate/ObjCMT.cpp
clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
clang/lib/AST/MicrosoftMangle.cpp
clang/lib/Basic/Module.cpp
clang/lib/Edit/EditedSource.cpp
clang/lib/Frontend/LayoutOverrideSource.cpp
clang/lib/Frontend/Rewrite/FrontendActions.cpp
clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
clang/lib/Lex/Lexer.cpp
clang/lib/Lex/ModuleMap.cpp
clang/lib/Sema/SemaAvailability.cpp
clang/lib/Sema/SemaDeclAttr.cpp
clang/lib/Sema/SemaExprObjC.cpp
clang/lib/Sema/SemaType.cpp
clang/lib/Tooling/Transformer/Parsing.cpp
clang/unittests/Basic/CharInfoTest.cpp
Removed:
################################################################################
diff --git a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
index 1a765666563ca..2af1f622aa92f 100644
--- a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
+++ b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
@@ -245,7 +245,7 @@ clang::TypoCorrection IncludeFixerSemaSource::CorrectTypo(
// parent_path.
// FIXME: Don't rely on source text.
const char *End = Source.end();
- while (isIdentifierBody(*End) || *End == ':')
+ while (isAsciiIdentifierContinue(*End) || *End == ':')
++End;
return std::string(Source.begin(), End);
diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp
index b5600c19a6d53..e6bda4f6abc37 100644
--- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp
+++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp
@@ -129,7 +129,7 @@ void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) {
const StringRef Port = "unsigned short port";
const char *Data = Result.SourceManager->getCharacterData(Loc);
if (!std::strncmp(Data, Port.data(), Port.size()) &&
- !isIdentifierBody(Data[Port.size()]))
+ !isAsciiIdentifierContinue(Data[Port.size()]))
return;
std::string Replacement =
diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
index 2bb97eca14ab2..d719f847f50d8 100644
--- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
@@ -464,7 +464,7 @@ void RenamerClangTidyCheck::check(const MatchFinder::MatchResult &Result) {
Failure.FixStatus = ShouldFixStatus::ConflictsWithKeyword;
else if (Ident->hasMacroDefinition())
Failure.FixStatus = ShouldFixStatus::ConflictsWithMacroDefinition;
- } else if (!isValidIdentifier(Info.Fixup)) {
+ } else if (!isValidAsciiIdentifier(Info.Fixup)) {
Failure.FixStatus = ShouldFixStatus::FixInvalidIdentifier;
}
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index abaedff82686c..54d0e69c4cf45 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1842,14 +1842,14 @@ CompletionPrefix guessCompletionPrefix(llvm::StringRef Content,
CompletionPrefix Result;
// Consume the unqualified name. We only handle ASCII characters.
- // isIdentifierBody will let us match "0invalid", but we don't mind.
- while (!Rest.empty() && isIdentifierBody(Rest.back()))
+ // isAsciiIdentifierContinue will let us match "0invalid", but we don't mind.
+ while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back()))
Rest = Rest.drop_back();
Result.Name = Content.slice(Rest.size(), Offset);
// Consume qualifiers.
while (Rest.consume_back("::") && !Rest.endswith(":")) // reject ::::
- while (!Rest.empty() && isIdentifierBody(Rest.back()))
+ while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back()))
Rest = Rest.drop_back();
Result.Qualifier =
Content.slice(Rest.size(), Result.Name.begin() - Content.begin());
@@ -2057,8 +2057,8 @@ bool allowImplicitCompletion(llvm::StringRef Content, unsigned Offset) {
return true;
// Complete words. Give non-ascii characters the benefit of the doubt.
- return !Content.empty() &&
- (isIdentifierBody(Content.back()) || !llvm::isASCII(Content.back()));
+ return !Content.empty() && (isAsciiIdentifierContinue(Content.back()) ||
+ !llvm::isASCII(Content.back()));
}
} // namespace clangd
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index 5a9cf05ea818a..800a574bf5b6a 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -945,9 +945,9 @@ llvm::Optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc,
if (Invalid)
return llvm::None;
unsigned B = Offset, E = Offset;
- while (B > 0 && isIdentifierBody(Code[B - 1]))
+ while (B > 0 && isAsciiIdentifierContinue(Code[B - 1]))
--B;
- while (E < Code.size() && isIdentifierBody(Code[E]))
+ while (E < Code.size() && isAsciiIdentifierContinue(Code[E]))
++E;
if (B == E)
return llvm::None;
diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
index 2193626ae099e..76182375ea170 100644
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -478,10 +478,10 @@ static bool mayBeValidIdentifier(llvm::StringRef Ident) {
// We don't check all the rules for non-ascii characters (most are allowed).
bool AllowDollar = true; // lenient
if (llvm::isASCII(Ident.front()) &&
- !isIdentifierHead(Ident.front(), AllowDollar))
+ !isAsciiIdentifierStart(Ident.front(), AllowDollar))
return false;
for (char C : Ident) {
- if (llvm::isASCII(C) && !isIdentifierBody(C, AllowDollar))
+ if (llvm::isASCII(C) && !isAsciiIdentifierContinue(C, AllowDollar))
return false;
}
return true;
diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h
index 472eb38b9829d..c751b6a005e28 100644
--- a/clang/include/clang/Basic/CharInfo.h
+++ b/clang/include/clang/Basic/CharInfo.h
@@ -50,8 +50,8 @@ LLVM_READNONE inline bool isASCII(uint32_t c) { return c <= 127; }
/// Returns true if this is a valid first character of a C identifier,
/// which is [a-zA-Z_].
-LLVM_READONLY inline bool isIdentifierHead(unsigned char c,
- bool AllowDollar = false) {
+LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c,
+ bool AllowDollar = false) {
using namespace charinfo;
if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER))
return true;
@@ -60,8 +60,8 @@ LLVM_READONLY inline bool isIdentifierHead(unsigned char c,
/// Returns true if this is a body character of a C identifier,
/// which is [a-zA-Z0-9_].
-LLVM_READONLY inline bool isIdentifierBody(unsigned char c,
- bool AllowDollar = false) {
+LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c,
+ bool AllowDollar = false) {
using namespace charinfo;
if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER))
return true;
@@ -186,13 +186,13 @@ LLVM_READONLY inline char toUppercase(char c) {
///
/// Note that this is a very simple check; it does not accept UCNs as valid
/// identifier characters.
-LLVM_READONLY inline bool isValidIdentifier(StringRef S,
- bool AllowDollar = false) {
- if (S.empty() || !isIdentifierHead(S[0], AllowDollar))
+LLVM_READONLY inline bool isValidAsciiIdentifier(StringRef S,
+ bool AllowDollar = false) {
+ if (S.empty() || !isAsciiIdentifierStart(S[0], AllowDollar))
return false;
for (StringRef::iterator I = S.begin(), E = S.end(); I != E; ++I)
- if (!isIdentifierBody(*I, AllowDollar))
+ if (!isAsciiIdentifierContinue(*I, AllowDollar))
return false;
return true;
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index a291520ae5cad..82f494e7c8cf9 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -536,7 +536,8 @@ class Lexer : public PreprocessorLexer {
bool SkipTrailingWhitespaceAndNewLine);
/// Returns true if the given character could appear in an identifier.
- static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
+ static bool isAsciiIdentifierContinueChar(char c,
+ const LangOptions &LangOpts);
/// Checks whether new line pointed by Str is preceded by escape
/// sequence.
@@ -573,10 +574,7 @@ class Lexer : public PreprocessorLexer {
bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
- /// Given that a token begins with the Unicode character \p C, figure out
- /// what kind of token it is and dispatch to the appropriate lexing helper
- /// function.
- bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
+ bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
/// FormTokenWithChars - When we lex a token, we have identified a span
/// starting at BufferPtr, going to TokEnd that forms the token. This method
@@ -701,7 +699,11 @@ class Lexer : public PreprocessorLexer {
bool IsStringLiteral);
// Helper functions to lex the remainder of a token of the specific type.
- bool LexIdentifier (Token &Result, const char *CurPtr);
+
+ // This function handles both ASCII and Unicode identifiers after
+ // the first codepoint of the identifier has been parsed.
+ bool LexIdentifierContinue(Token &Result, const char *CurPtr);
+
bool LexNumericConstant (Token &Result, const char *CurPtr);
bool LexStringLiteral (Token &Result, const char *CurPtr,
tok::TokenKind Kind);
diff --git a/clang/lib/ARCMigrate/ObjCMT.cpp b/clang/lib/ARCMigrate/ObjCMT.cpp
index c8069b51567c2..c8a389d1f2e5a 100644
--- a/clang/lib/ARCMigrate/ObjCMT.cpp
+++ b/clang/lib/ARCMigrate/ObjCMT.cpp
@@ -1144,7 +1144,7 @@ static bool AttributesMatch(const Decl *Decl1, const Decl *Decl2,
static bool IsValidIdentifier(ASTContext &Ctx,
const char *Name) {
- if (!isIdentifierHead(Name[0]))
+ if (!isAsciiIdentifierStart(Name[0]))
return false;
std::string NameString = Name;
NameString[0] = toLowercase(NameString[0]);
diff --git a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
index e767ad5346c30..b14364509a0bf 100644
--- a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
+++ b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
@@ -253,7 +253,8 @@ class UnbridgedCastRewriter : public RecursiveASTVisitor<UnbridgedCastRewriter>{
SourceManager &SM = Pass.Ctx.getSourceManager();
char PrevChar = *SM.getCharacterData(InsertLoc.getLocWithOffset(-1));
- if (Lexer::isIdentifierBodyChar(PrevChar, Pass.Ctx.getLangOpts()))
+ if (Lexer::isAsciiIdentifierContinueChar(PrevChar,
+ Pass.Ctx.getLangOpts()))
BridgeCall += ' ';
if (Kind == OBC_BridgeTransfer)
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index d3d65087d3adc..63fc7e428c07f 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -3884,7 +3884,7 @@ void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL,
// - ?[A-Z]: The range from \xc1 to \xda.
// - ?[0-9]: The set of [,/\:. \n\t'-].
// - ?$XX: A fallback which maps nibbles.
- if (isIdentifierBody(Byte, /*AllowDollar=*/true)) {
+ if (isAsciiIdentifierContinue(Byte, /*AllowDollar=*/true)) {
Mangler.getStream() << Byte;
} else if (isLetter(Byte & 0x7f)) {
Mangler.getStream() << '?' << static_cast<char>(Byte & 0x7f);
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
index b6cf1624ef017..4ec0699e40d49 100644
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -203,7 +203,7 @@ static void printModuleId(raw_ostream &OS, InputIter Begin, InputIter End,
OS << ".";
StringRef Name = getModuleNameFromComponent(*It);
- if (!AllowStringLiterals || isValidIdentifier(Name))
+ if (!AllowStringLiterals || isValidAsciiIdentifier(Name))
OS << Name;
else {
OS << '"';
diff --git a/clang/lib/Edit/EditedSource.cpp b/clang/lib/Edit/EditedSource.cpp
index 74e6005faeb04..43da3451aa15a 100644
--- a/clang/lib/Edit/EditedSource.cpp
+++ b/clang/lib/Edit/EditedSource.cpp
@@ -314,8 +314,8 @@ bool EditedSource::commit(const Commit &commit) {
static bool canBeJoined(char left, char right, const LangOptions &LangOpts) {
// FIXME: Should use TokenConcatenation to make sure we don't allow stuff like
// making two '<' adjacent.
- return !(Lexer::isIdentifierBodyChar(left, LangOpts) &&
- Lexer::isIdentifierBodyChar(right, LangOpts));
+ return !(Lexer::isAsciiIdentifierContinueChar(left, LangOpts) &&
+ Lexer::isAsciiIdentifierContinueChar(right, LangOpts));
}
/// Returns true if it is ok to eliminate the trailing whitespace between
diff --git a/clang/lib/Frontend/LayoutOverrideSource.cpp b/clang/lib/Frontend/LayoutOverrideSource.cpp
index 76762d58fe254..c735c6c42cb32 100644
--- a/clang/lib/Frontend/LayoutOverrideSource.cpp
+++ b/clang/lib/Frontend/LayoutOverrideSource.cpp
@@ -16,11 +16,11 @@ using namespace clang;
/// Parse a simple identifier.
static std::string parseName(StringRef S) {
- if (S.empty() || !isIdentifierHead(S[0]))
+ if (S.empty() || !isAsciiIdentifierStart(S[0]))
return "";
unsigned Offset = 1;
- while (Offset < S.size() && isIdentifierBody(S[Offset]))
+ while (Offset < S.size() && isAsciiIdentifierContinue(S[Offset]))
++Offset;
return S.substr(0, Offset).str();
diff --git a/clang/lib/Frontend/Rewrite/FrontendActions.cpp b/clang/lib/Frontend/Rewrite/FrontendActions.cpp
index 09ed07be923ea..6685109f8d333 100644
--- a/clang/lib/Frontend/Rewrite/FrontendActions.cpp
+++ b/clang/lib/Frontend/Rewrite/FrontendActions.cpp
@@ -231,7 +231,7 @@ class RewriteIncludesAction::RewriteImportsListener : public ASTReaderListener {
assert(OS && "loaded module file after finishing rewrite action?");
(*OS) << "#pragma clang module build ";
- if (isValidIdentifier(MF->ModuleName))
+ if (isValidAsciiIdentifier(MF->ModuleName))
(*OS) << MF->ModuleName;
else {
(*OS) << '"';
diff --git a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
index cfca167f8bf1e..f5cbd5e51b9b9 100644
--- a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
+++ b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
@@ -131,17 +131,17 @@ LLVM_NODISCARD static bool isRawStringLiteral(const char *First,
--Current;
if (*Current != 'R')
return false;
- if (First == Current || !isIdentifierBody(*--Current))
+ if (First == Current || !isAsciiIdentifierContinue(*--Current))
return true;
// Check for a prefix of "u", "U", or "L".
if (*Current == 'u' || *Current == 'U' || *Current == 'L')
- return First == Current || !isIdentifierBody(*--Current);
+ return First == Current || !isAsciiIdentifierContinue(*--Current);
// Check for a prefix of "u8".
if (*Current != '8' || First == Current || *Current-- != 'u')
return false;
- return First == Current || !isIdentifierBody(*--Current);
+ return First == Current || !isAsciiIdentifierContinue(*--Current);
}
static void skipRawString(const char *&First, const char *const End) {
@@ -319,7 +319,7 @@ static bool isQuoteCppDigitSeparator(const char *const Start,
if (!isPreprocessingNumberBody(Prev))
return false;
// The next character should be a valid identifier body character.
- return (Cur + 1) < End && isIdentifierBody(*(Cur + 1));
+ return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
}
static void skipLine(const char *&First, const char *const End) {
@@ -484,7 +484,7 @@ void Minimizer::printAdjacentModuleNameParts(const char *&First,
const char *Last = First;
do
++Last;
- while (Last != End && (isIdentifierBody(*Last) || *Last == '.'));
+ while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.'));
append(First, Last);
First = Last;
}
@@ -507,7 +507,7 @@ bool Minimizer::printAtImportBody(const char *&First, const char *const End) {
}
// Don't handle macro expansions inside @import for now.
- if (!isIdentifierBody(*First) && *First != '.')
+ if (!isAsciiIdentifierContinue(*First) && *First != '.')
return true;
printAdjacentModuleNameParts(First, End);
@@ -524,9 +524,9 @@ void Minimizer::printDirectiveBody(const char *&First, const char *const End) {
LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
const char *const End) {
- assert(isIdentifierBody(*First) && "invalid identifer");
+ assert(isAsciiIdentifierContinue(*First) && "invalid identifer");
const char *Last = First + 1;
- while (Last != End && isIdentifierBody(*Last))
+ while (Last != End && isAsciiIdentifierContinue(*Last))
++Last;
return Last;
}
@@ -540,7 +540,7 @@ getIdentifierContinuation(const char *First, const char *const End) {
skipNewline(First, End);
if (First == End)
return nullptr;
- return isIdentifierBody(First[0]) ? First : nullptr;
+ return isAsciiIdentifierContinue(First[0]) ? First : nullptr;
}
Minimizer::IdInfo Minimizer::lexIdentifier(const char *First,
@@ -569,7 +569,7 @@ void Minimizer::printAdjacentMacroArgs(const char *&First,
do
++Last;
while (Last != End &&
- (isIdentifierBody(*Last) || *Last == '.' || *Last == ','));
+ (isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ','));
append(First, Last);
First = Last;
}
@@ -588,7 +588,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
}
// This is intentionally fairly liberal.
- if (!(isIdentifierBody(*First) || *First == '.' || *First == ','))
+ if (!(isAsciiIdentifierContinue(*First) || *First == '.' || *First == ','))
return true;
printAdjacentMacroArgs(First, End);
@@ -602,7 +602,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,
const char *const End) {
skipWhitespace(First, End);
- if (First == End || !isIdentifierHead(*First))
+ if (First == End || !isAsciiIdentifierStart(*First))
return false;
IdInfo FoundId = lexIdentifier(First, End);
@@ -639,7 +639,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) {
if (Id.Name == "export") {
Export = true;
skipWhitespace(First, End);
- if (!isIdentifierBody(*First)) {
+ if (!isAsciiIdentifierContinue(*First)) {
skipLine(First, End);
return false;
}
@@ -663,7 +663,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) {
case '"':
break;
default:
- if (!isIdentifierBody(*First)) {
+ if (!isAsciiIdentifierContinue(*First)) {
skipLine(First, End);
return false;
}
@@ -690,7 +690,7 @@ bool Minimizer::lexDefine(const char *&First, const char *const End) {
append("#define ");
skipWhitespace(First, End);
- if (!isIdentifierHead(*First))
+ if (!isAsciiIdentifierStart(*First))
return reportError(First, diag::err_pp_macro_not_identifier);
IdInfo Id = lexIdentifier(First, End);
@@ -722,7 +722,7 @@ bool Minimizer::lexDefine(const char *&First, const char *const End) {
bool Minimizer::lexPragma(const char *&First, const char *const End) {
// #pragma.
skipWhitespace(First, End);
- if (First == End || !isIdentifierHead(*First))
+ if (First == End || !isAsciiIdentifierStart(*First))
return false;
IdInfo FoundId = lexIdentifier(First, End);
@@ -827,7 +827,7 @@ bool Minimizer::lexPPLine(const char *&First, const char *const End) {
if (First == End)
return reportError(First, diag::err_pp_expected_eol);
- if (!isIdentifierHead(*First)) {
+ if (!isAsciiIdentifierStart(*First)) {
skipLine(First, End);
return false;
}
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 05b84e0c14b3e..2685924392d05 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1062,8 +1062,8 @@ StringRef Lexer::getImmediateMacroNameForDiagnostics(
return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
-bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
- return isIdentifierBody(c, LangOpts.DollarIdents);
+bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
+ return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
}
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
@@ -1712,103 +1712,128 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
return true;
}
-bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
- // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
- unsigned Size;
- unsigned char C = *CurPtr++;
- while (isIdentifierBody(C))
- C = *CurPtr++;
-
- --CurPtr; // Back up over the skipped character.
-
- // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
- // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
- //
- // TODO: Could merge these checks into an InfoTable flag to make the
- // comparison cheaper
- if (isASCII(C) && C != '\\' && C != '?' &&
- (C != '$' || !LangOpts.DollarIdents)) {
-FinishIdentifier:
- const char *IdStart = BufferPtr;
- FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
- Result.setRawIdentifierData(IdStart);
-
- // If we are in raw mode, return this identifier raw. There is no need to
- // look up identifier information or attempt to macro expand it.
- if (LexingRawMode)
- return true;
-
- // Fill in Result.IdentifierInfo and update the token kind,
- // looking up the identifier in the identifier table.
- IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
- // Note that we have to call PP->LookUpIdentifierInfo() even for code
- // completion, it writes IdentifierInfo into Result, and callers rely on it.
-
- // If the completion point is at the end of an identifier, we want to treat
- // the identifier as incomplete even if it resolves to a macro or a keyword.
- // This allows e.g. 'class^' to complete to 'classifier'.
- if (isCodeCompletionPoint(CurPtr)) {
- // Return the code-completion token.
- Result.setKind(tok::code_completion);
- // Skip the code-completion char and all immediate identifier characters.
- // This ensures we get consistent behavior when completing at any point in
- // an identifier (i.e. at the start, in the middle, at the end). Note that
- // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
- // simpler.
- assert(*CurPtr == 0 && "Completion character must be 0");
- ++CurPtr;
- // Note that code completion token is not added as a separate character
- // when the completion point is at the end of the buffer. Therefore, we need
- // to check if the buffer has ended.
- if (CurPtr < BufferEnd) {
- while (isIdentifierBody(*CurPtr))
- ++CurPtr;
- }
- BufferPtr = CurPtr;
- return true;
+bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
+ const char *CurPtr) {
+ if (isAllowedInitiallyIDChar(C, LangOpts)) {
+ if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
+ !PP->isPreprocessedOutput()) {
+ maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
+ makeCharRange(*this, BufferPtr, CurPtr),
+ /*IsFirst=*/true);
+ maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
+ makeCharRange(*this, BufferPtr, CurPtr));
}
- // Finally, now that we know we have an identifier, pass this off to the
- // preprocessor, which may macro expand it or something.
- if (II->isHandleIdentifierCase())
- return PP->HandleIdentifier(Result);
+ MIOpt.ReadToken();
+ return LexIdentifierContinue(Result, CurPtr);
+ }
- return true;
+ if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
+ !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
+ !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
+ // Non-ASCII characters tend to creep into source code unintentionally.
+ // Instead of letting the parser complain about the unknown token,
+ // just drop the character.
+ // Note that we can /only/ do this when the non-ASCII character is actually
+ // spelled as Unicode, not written as a UCN. The standard requires that
+ // we not throw away any possible preprocessor tokens, but there's a
+ // loophole in the mapping of Unicode characters to basic character set
+ // characters that allows us to map these particular characters to, say,
+ // whitespace.
+ diagnoseInvalidUnicodeCodepointInIdentifier(
+ PP->getDiagnostics(), LangOpts, C,
+ makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
+ BufferPtr = CurPtr;
+ return false;
}
- // Otherwise, $,\,? in identifier found. Enter slower path.
+ // Otherwise, we have an explicit UCN or a character that's unlikely to show
+ // up by accident.
+ MIOpt.ReadToken();
+ FormTokenWithChars(Result, CurPtr, tok::unknown);
+ return true;
+}
- C = getCharAndSize(CurPtr, Size);
+bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
+ // Match [_A-Za-z0-9]*, we have already matched an identifier start.
while (true) {
+ unsigned char C = *CurPtr;
+ // Fast path.
+ if (isAsciiIdentifierContinue(C)) {
+ ++CurPtr;
+ continue;
+ }
+
+ unsigned Size;
+ // Slow path: handle trigraph, unicode codepoints, UCNs.
+ C = getCharAndSize(CurPtr, Size);
+ if (isAsciiIdentifierContinue(C)) {
+ CurPtr = ConsumeChar(CurPtr, Size, Result);
+ continue;
+ }
if (C == '$') {
// If we hit a $ and they are not supported in identifiers, we are done.
- if (!LangOpts.DollarIdents) goto FinishIdentifier;
-
+ if (!LangOpts.DollarIdents)
+ break;
// Otherwise, emit a diagnostic and continue.
if (!isLexingRawMode())
Diag(CurPtr, diag::ext_dollar_in_identifier);
CurPtr = ConsumeChar(CurPtr, Size, Result);
- C = getCharAndSize(CurPtr, Size);
continue;
- } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
- C = getCharAndSize(CurPtr, Size);
+ }
+ if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
continue;
- } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
- C = getCharAndSize(CurPtr, Size);
+ if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
continue;
- } else if (!isIdentifierBody(C)) {
- goto FinishIdentifier;
- }
+ // Neither an expected Unicode codepoint nor a UCN.
+ break;
+ }
- // Otherwise, this character is good, consume it.
- CurPtr = ConsumeChar(CurPtr, Size, Result);
+ const char *IdStart = BufferPtr;
+ FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
+ Result.setRawIdentifierData(IdStart);
- C = getCharAndSize(CurPtr, Size);
- while (isIdentifierBody(C)) {
- CurPtr = ConsumeChar(CurPtr, Size, Result);
- C = getCharAndSize(CurPtr, Size);
+ // If we are in raw mode, return this identifier raw. There is no need to
+ // look up identifier information or attempt to macro expand it.
+ if (LexingRawMode)
+ return true;
+
+ // Fill in Result.IdentifierInfo and update the token kind,
+ // looking up the identifier in the identifier table.
+ IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
+ // Note that we have to call PP->LookUpIdentifierInfo() even for code
+ // completion, it writes IdentifierInfo into Result, and callers rely on it.
+
+ // If the completion point is at the end of an identifier, we want to treat
+ // the identifier as incomplete even if it resolves to a macro or a keyword.
+ // This allows e.g. 'class^' to complete to 'classifier'.
+ if (isCodeCompletionPoint(CurPtr)) {
+ // Return the code-completion token.
+ Result.setKind(tok::code_completion);
+ // Skip the code-completion char and all immediate identifier characters.
+ // This ensures we get consistent behavior when completing at any point in
+ // an identifier (i.e. at the start, in the middle, at the end). Note that
+ // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
+ // simpler.
+ assert(*CurPtr == 0 && "Completion character must be 0");
+ ++CurPtr;
+ // Note that code completion token is not added as a separate character
+ // when the completion point is at the end of the buffer. Therefore, we need
+ // to check if the buffer has ended.
+ if (CurPtr < BufferEnd) {
+ while (isAsciiIdentifierContinue(*CurPtr))
+ ++CurPtr;
}
+ BufferPtr = CurPtr;
+ return true;
}
+
+ // Finally, now that we know we have an identifier, pass this off to the
+ // preprocessor, which may macro expand it or something.
+ if (II->isHandleIdentifierCase())
+ return PP->HandleIdentifier(Result);
+
+ return true;
}
/// isHexaLiteral - Return true if Start points to a hex constant.
@@ -1864,7 +1889,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
if (C == '\'' && (getLangOpts().CPlusPlus14 || getLangOpts().C2x)) {
unsigned NextSize;
char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
- if (isIdentifierBody(Next)) {
+ if (isAsciiIdentifierContinue(Next)) {
if (!isLexingRawMode())
Diag(CurPtr, getLangOpts().CPlusPlus
? diag::warn_cxx11_compat_digit_separator
@@ -1899,7 +1924,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
char C = getCharAndSize(CurPtr, Size);
bool Consumed = false;
- if (!isIdentifierHead(C)) {
+ if (!isAsciiIdentifierStart(C)) {
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
Consumed = true;
else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
@@ -1938,7 +1963,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
unsigned NextSize;
char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
getLangOpts());
- if (!isIdentifierBody(Next)) {
+ if (!isAsciiIdentifierContinue(Next)) {
// End of suffix. Check whether this is on the allowed list.
const StringRef CompleteSuffix(Buffer, Chars);
IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
@@ -1970,10 +1995,12 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
Result.setFlag(Token::HasUDSuffix);
while (true) {
C = getCharAndSize(CurPtr, Size);
- if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
- else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
- else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
- else break;
+ if (isAsciiIdentifierContinue(C)) {
+ CurPtr = ConsumeChar(CurPtr, Size, Result);
+ } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
+ } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
+ } else
+ break;
}
return CurPtr;
@@ -3205,47 +3232,6 @@ bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
return false;
}
-bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
- if (isAllowedInitiallyIDChar(C, LangOpts)) {
- if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
- !PP->isPreprocessedOutput()) {
- maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
- makeCharRange(*this, BufferPtr, CurPtr),
- /*IsFirst=*/true);
- maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
- makeCharRange(*this, BufferPtr, CurPtr));
- }
-
- MIOpt.ReadToken();
- return LexIdentifier(Result, CurPtr);
- }
-
- if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
- !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
- !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
- // Non-ASCII characters tend to creep into source code unintentionally.
- // Instead of letting the parser complain about the unknown token,
- // just drop the character.
- // Note that we can /only/ do this when the non-ASCII character is actually
- // spelled as Unicode, not written as a UCN. The standard requires that
- // we not throw away any possible preprocessor tokens, but there's a
- // loophole in the mapping of Unicode characters to basic character set
- // characters that allows us to map these particular characters to, say,
- // whitespace.
- diagnoseInvalidUnicodeCodepointInIdentifier(
- PP->getDiagnostics(), LangOpts, C,
- makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
- BufferPtr = CurPtr;
- return false;
- }
-
- // Otherwise, we have an explicit UCN or a character that's unlikely to show
- // up by accident.
- MIOpt.ReadToken();
- FormTokenWithChars(Result, CurPtr, tok::unknown);
- return true;
-}
-
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
IsAtStartOfLine = Result.isAtStartOfLine();
HasLeadingSpace = Result.hasLeadingSpace();
@@ -3489,7 +3475,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
}
// treat u like the start of an identifier.
- return LexIdentifier(Result, CurPtr);
+ return LexIdentifierContinue(Result, CurPtr);
case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal
// Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -3518,7 +3504,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
}
// treat U like the start of an identifier.
- return LexIdentifier(Result, CurPtr);
+ return LexIdentifierContinue(Result, CurPtr);
case 'R': // Identifier or C++0x raw string literal
// Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -3534,7 +3520,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
}
// treat R like the start of an identifier.
- return LexIdentifier(Result, CurPtr);
+ return LexIdentifierContinue(Result, CurPtr);
case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
// Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -3573,7 +3559,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
case '_':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
- return LexIdentifier(Result, CurPtr);
+ return LexIdentifierContinue(Result, CurPtr);
case '$': // $ in identifiers.
if (LangOpts.DollarIdents) {
@@ -3581,7 +3567,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
Diag(CurPtr-1, diag::ext_dollar_in_identifier);
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
- return LexIdentifier(Result, CurPtr);
+ return LexIdentifierContinue(Result, CurPtr);
}
Kind = tok::unknown;
@@ -3996,7 +3982,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
goto LexNextToken;
}
- return LexUnicode(Result, CodePoint, CurPtr);
+ return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
}
}
@@ -4028,7 +4014,7 @@ bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
}
- return LexUnicode(Result, CodePoint, CurPtr);
+ return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
}
if (isLexingRawMode() || ParsingPreprocessorDirective ||
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index f9af7c2a24fb9..8475417171c82 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -338,7 +338,7 @@ static StringRef sanitizeFilenameAsIdentifier(StringRef Name,
if (Name.empty())
return Name;
- if (!isValidIdentifier(Name)) {
+ if (!isValidAsciiIdentifier(Name)) {
// If we don't already have something with the form of an identifier,
// create a buffer with the sanitized name.
Buffer.clear();
@@ -346,7 +346,7 @@ static StringRef sanitizeFilenameAsIdentifier(StringRef Name,
Buffer.push_back('_');
Buffer.reserve(Buffer.size() + Name.size());
for (unsigned I = 0, N = Name.size(); I != N; ++I) {
- if (isIdentifierBody(Name[I]))
+ if (isAsciiIdentifierContinue(Name[I]))
Buffer.push_back(Name[I]);
else
Buffer.push_back('_');
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index edbeced3da4e0..d1c3d25b089d3 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -268,7 +268,7 @@ tryParseObjCMethodName(StringRef Name, SmallVectorImpl<StringRef> &SlotNames,
for (StringRef S : SlotNames) {
if (S.empty())
continue;
- if (!isValidIdentifier(S, AllowDollar))
+ if (!isValidAsciiIdentifier(S, AllowDollar))
return None;
}
return NumParams;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index f93db4babd018..35c32043e377d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -6105,7 +6105,7 @@ validateSwiftFunctionName(Sema &S, const ParsedAttr &AL, SourceLocation Loc,
if (BaseName.empty()) {
BaseName = ContextName;
ContextName = StringRef();
- } else if (ContextName.empty() || !isValidIdentifier(ContextName)) {
+ } else if (ContextName.empty() || !isValidAsciiIdentifier(ContextName)) {
S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier)
<< AL << /*context*/ 1;
return false;
@@ -6113,7 +6113,7 @@ validateSwiftFunctionName(Sema &S, const ParsedAttr &AL, SourceLocation Loc,
IsMember = true;
}
- if (!isValidIdentifier(BaseName) || BaseName == "_") {
+ if (!isValidAsciiIdentifier(BaseName) || BaseName == "_") {
S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier)
<< AL << /*basename*/ 0;
return false;
@@ -6163,7 +6163,7 @@ validateSwiftFunctionName(Sema &S, const ParsedAttr &AL, SourceLocation Loc,
do {
std::tie(CurrentParam, Parameters) = Parameters.split(':');
- if (!isValidIdentifier(CurrentParam)) {
+ if (!isValidAsciiIdentifier(CurrentParam)) {
S.Diag(Loc, diag::warn_attr_swift_name_invalid_identifier)
<< AL << /*parameter*/2;
return false;
@@ -6332,13 +6332,13 @@ bool Sema::DiagnoseSwiftName(Decl *D, StringRef Name, SourceLocation Loc,
if (BaseName.empty()) {
BaseName = ContextName;
ContextName = StringRef();
- } else if (!isValidIdentifier(ContextName)) {
+ } else if (!isValidAsciiIdentifier(ContextName)) {
Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL
<< /*context*/1;
return false;
}
- if (!isValidIdentifier(BaseName)) {
+ if (!isValidAsciiIdentifier(BaseName)) {
Diag(Loc, diag::warn_attr_swift_name_invalid_identifier) << AL
<< /*basename*/0;
return false;
diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp
index 102edb0136e59..bdc8e1e0b336a 100644
--- a/clang/lib/Sema/SemaExprObjC.cpp
+++ b/clang/lib/Sema/SemaExprObjC.cpp
@@ -3772,7 +3772,7 @@ static void addFixitForObjCARCConversion(
SourceManager &SM = S.getSourceManager();
char PrevChar = *SM.getCharacterData(range.getBegin().getLocWithOffset(-1));
- if (Lexer::isIdentifierBodyChar(PrevChar, S.getLangOpts()))
+ if (Lexer::isAsciiIdentifierContinueChar(PrevChar, S.getLangOpts()))
BridgeCall += ' ';
BridgeCall += CFBridgeName;
@@ -3790,7 +3790,7 @@ static void addFixitForObjCARCConversion(
SourceManager &SM = S.getSourceManager();
char PrevChar = *SM.getCharacterData(range.getBegin().getLocWithOffset(-1));
- if (Lexer::isIdentifierBodyChar(PrevChar, S.getLangOpts()))
+ if (Lexer::isAsciiIdentifierContinueChar(PrevChar, S.getLangOpts()))
BridgeCall += ' ';
BridgeCall += CFBridgeName;
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 69c9de35c68eb..dcf18d3b4ba3d 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -4260,8 +4260,8 @@ static void fixItNullability(Sema &S, DiagBuilderT &Diag,
InsertionText = InsertionText.drop_back().drop_front();
else
InsertionText = InsertionText.drop_front();
- } else if (!isIdentifierBody(NextChar[0], /*allow dollar*/true) &&
- !isIdentifierBody(NextChar[-1], /*allow dollar*/true)) {
+ } else if (!isAsciiIdentifierContinue(NextChar[0], /*allow dollar*/ true) &&
+ !isAsciiIdentifierContinue(NextChar[-1], /*allow dollar*/ true)) {
InsertionText = InsertionText.drop_back().drop_front();
}
diff --git a/clang/lib/Tooling/Transformer/Parsing.cpp b/clang/lib/Tooling/Transformer/Parsing.cpp
index 66fa04a15594a..242db2a16b43e 100644
--- a/clang/lib/Tooling/Transformer/Parsing.cpp
+++ b/clang/lib/Tooling/Transformer/Parsing.cpp
@@ -165,7 +165,7 @@ static ExpectedProgress<llvm::NoneType> parseChar(char c, ParseState State) {
static ExpectedProgress<std::string> parseId(ParseState State) {
State.Input = consumeWhitespace(State.Input);
auto Id = State.Input.take_while(
- [](char c) { return isASCII(c) && isIdentifierBody(c); });
+ [](char c) { return isASCII(c) && isAsciiIdentifierContinue(c); });
if (Id.empty())
return makeParseError(State, "failed to parse name");
return makeParseProgress(advance(State, Id.size()), Id.str());
diff --git a/clang/unittests/Basic/CharInfoTest.cpp b/clang/unittests/Basic/CharInfoTest.cpp
index 4f84bebec30be..491c9afceb6f8 100644
--- a/clang/unittests/Basic/CharInfoTest.cpp
+++ b/clang/unittests/Basic/CharInfoTest.cpp
@@ -50,44 +50,44 @@ TEST(CharInfoTest, isASCII) {
EXPECT_FALSE(isASCII('\xff'));
}
-TEST(CharInfoTest, isIdentifierHead) {
- EXPECT_TRUE(isIdentifierHead('a'));
- EXPECT_TRUE(isIdentifierHead('A'));
- EXPECT_TRUE(isIdentifierHead('z'));
- EXPECT_TRUE(isIdentifierHead('Z'));
- EXPECT_TRUE(isIdentifierHead('_'));
-
- EXPECT_FALSE(isIdentifierHead('0'));
- EXPECT_FALSE(isIdentifierHead('.'));
- EXPECT_FALSE(isIdentifierHead('`'));
- EXPECT_FALSE(isIdentifierHead('\0'));
-
- EXPECT_FALSE(isIdentifierHead('$'));
- EXPECT_TRUE(isIdentifierHead('$', /*AllowDollar=*/true));
-
- EXPECT_FALSE(isIdentifierHead('\x80'));
- EXPECT_FALSE(isIdentifierHead('\xc2'));
- EXPECT_FALSE(isIdentifierHead('\xff'));
+TEST(CharInfoTest, isAsciiIdentifierStart) {
+ EXPECT_TRUE(isAsciiIdentifierStart('a'));
+ EXPECT_TRUE(isAsciiIdentifierStart('A'));
+ EXPECT_TRUE(isAsciiIdentifierStart('z'));
+ EXPECT_TRUE(isAsciiIdentifierStart('Z'));
+ EXPECT_TRUE(isAsciiIdentifierStart('_'));
+
+ EXPECT_FALSE(isAsciiIdentifierStart('0'));
+ EXPECT_FALSE(isAsciiIdentifierStart('.'));
+ EXPECT_FALSE(isAsciiIdentifierStart('`'));
+ EXPECT_FALSE(isAsciiIdentifierStart('\0'));
+
+ EXPECT_FALSE(isAsciiIdentifierStart('$'));
+ EXPECT_TRUE(isAsciiIdentifierStart('$', /*AllowDollar=*/true));
+
+ EXPECT_FALSE(isAsciiIdentifierStart('\x80'));
+ EXPECT_FALSE(isAsciiIdentifierStart('\xc2'));
+ EXPECT_FALSE(isAsciiIdentifierStart('\xff'));
}
-TEST(CharInfoTest, isIdentifierBody) {
- EXPECT_TRUE(isIdentifierBody('a'));
- EXPECT_TRUE(isIdentifierBody('A'));
- EXPECT_TRUE(isIdentifierBody('z'));
- EXPECT_TRUE(isIdentifierBody('Z'));
- EXPECT_TRUE(isIdentifierBody('_'));
+TEST(CharInfoTest, isAsciiIdentifierContinue) {
+ EXPECT_TRUE(isAsciiIdentifierContinue('a'));
+ EXPECT_TRUE(isAsciiIdentifierContinue('A'));
+ EXPECT_TRUE(isAsciiIdentifierContinue('z'));
+ EXPECT_TRUE(isAsciiIdentifierContinue('Z'));
+ EXPECT_TRUE(isAsciiIdentifierContinue('_'));
- EXPECT_TRUE(isIdentifierBody('0'));
- EXPECT_FALSE(isIdentifierBody('.'));
- EXPECT_FALSE(isIdentifierBody('`'));
- EXPECT_FALSE(isIdentifierBody('\0'));
+ EXPECT_TRUE(isAsciiIdentifierContinue('0'));
+ EXPECT_FALSE(isAsciiIdentifierContinue('.'));
+ EXPECT_FALSE(isAsciiIdentifierContinue('`'));
+ EXPECT_FALSE(isAsciiIdentifierContinue('\0'));
- EXPECT_FALSE(isIdentifierBody('$'));
- EXPECT_TRUE(isIdentifierBody('$', /*AllowDollar=*/true));
+ EXPECT_FALSE(isAsciiIdentifierContinue('$'));
+ EXPECT_TRUE(isAsciiIdentifierContinue('$', /*AllowDollar=*/true));
- EXPECT_FALSE(isIdentifierBody('\x80'));
- EXPECT_FALSE(isIdentifierBody('\xc2'));
- EXPECT_FALSE(isIdentifierBody('\xff'));
+ EXPECT_FALSE(isAsciiIdentifierContinue('\x80'));
+ EXPECT_FALSE(isAsciiIdentifierContinue('\xc2'));
+ EXPECT_FALSE(isAsciiIdentifierContinue('\xff'));
}
TEST(CharInfoTest, isHorizontalWhitespace) {
@@ -413,91 +413,91 @@ TEST(CharInfoTest, toUppercase) {
EXPECT_EQ('\0', toUppercase('\0'));
}
-TEST(CharInfoTest, isValidIdentifier) {
- EXPECT_FALSE(isValidIdentifier(""));
+TEST(CharInfoTest, isValidAsciiIdentifier) {
+ EXPECT_FALSE(isValidAsciiIdentifier(""));
// 1 character
- EXPECT_FALSE(isValidIdentifier("."));
- EXPECT_FALSE(isValidIdentifier("\n"));
- EXPECT_FALSE(isValidIdentifier(" "));
- EXPECT_FALSE(isValidIdentifier("\x80"));
- EXPECT_FALSE(isValidIdentifier("\xc2"));
- EXPECT_FALSE(isValidIdentifier("\xff"));
- EXPECT_FALSE(isValidIdentifier("$"));
- EXPECT_FALSE(isValidIdentifier("1"));
-
- EXPECT_TRUE(isValidIdentifier("_"));
- EXPECT_TRUE(isValidIdentifier("a"));
- EXPECT_TRUE(isValidIdentifier("z"));
- EXPECT_TRUE(isValidIdentifier("A"));
- EXPECT_TRUE(isValidIdentifier("Z"));
- EXPECT_TRUE(isValidIdentifier("$", /*AllowDollar=*/true));
+ EXPECT_FALSE(isValidAsciiIdentifier("."));
+ EXPECT_FALSE(isValidAsciiIdentifier("\n"));
+ EXPECT_FALSE(isValidAsciiIdentifier(" "));
+ EXPECT_FALSE(isValidAsciiIdentifier("\x80"));
+ EXPECT_FALSE(isValidAsciiIdentifier("\xc2"));
+ EXPECT_FALSE(isValidAsciiIdentifier("\xff"));
+ EXPECT_FALSE(isValidAsciiIdentifier("$"));
+ EXPECT_FALSE(isValidAsciiIdentifier("1"));
+
+ EXPECT_TRUE(isValidAsciiIdentifier("_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("a"));
+ EXPECT_TRUE(isValidAsciiIdentifier("z"));
+ EXPECT_TRUE(isValidAsciiIdentifier("A"));
+ EXPECT_TRUE(isValidAsciiIdentifier("Z"));
+ EXPECT_TRUE(isValidAsciiIdentifier("$", /*AllowDollar=*/true));
// 2 characters, '_' suffix
- EXPECT_FALSE(isValidIdentifier("._"));
- EXPECT_FALSE(isValidIdentifier("\n_"));
- EXPECT_FALSE(isValidIdentifier(" _"));
- EXPECT_FALSE(isValidIdentifier("\x80_"));
- EXPECT_FALSE(isValidIdentifier("\xc2_"));
- EXPECT_FALSE(isValidIdentifier("\xff_"));
- EXPECT_FALSE(isValidIdentifier("$_"));
- EXPECT_FALSE(isValidIdentifier("1_"));
-
- EXPECT_TRUE(isValidIdentifier("__"));
- EXPECT_TRUE(isValidIdentifier("a_"));
- EXPECT_TRUE(isValidIdentifier("z_"));
- EXPECT_TRUE(isValidIdentifier("A_"));
- EXPECT_TRUE(isValidIdentifier("Z_"));
- EXPECT_TRUE(isValidIdentifier("$_", /*AllowDollar=*/true));
+ EXPECT_FALSE(isValidAsciiIdentifier("._"));
+ EXPECT_FALSE(isValidAsciiIdentifier("\n_"));
+ EXPECT_FALSE(isValidAsciiIdentifier(" _"));
+ EXPECT_FALSE(isValidAsciiIdentifier("\x80_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("\xc2_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("\xff_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("$_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("1_"));
+
+ EXPECT_TRUE(isValidAsciiIdentifier("__"));
+ EXPECT_TRUE(isValidAsciiIdentifier("a_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("z_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("A_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("Z_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("$_", /*AllowDollar=*/true));
// 2 characters, '_' prefix
- EXPECT_FALSE(isValidIdentifier("_."));
- EXPECT_FALSE(isValidIdentifier("_\n"));
- EXPECT_FALSE(isValidIdentifier("_ "));
- EXPECT_FALSE(isValidIdentifier("_\x80"));
- EXPECT_FALSE(isValidIdentifier("_\xc2"));
- EXPECT_FALSE(isValidIdentifier("_\xff"));
- EXPECT_FALSE(isValidIdentifier("_$"));
- EXPECT_TRUE(isValidIdentifier("_1"));
-
- EXPECT_TRUE(isValidIdentifier("__"));
- EXPECT_TRUE(isValidIdentifier("_a"));
- EXPECT_TRUE(isValidIdentifier("_z"));
- EXPECT_TRUE(isValidIdentifier("_A"));
- EXPECT_TRUE(isValidIdentifier("_Z"));
- EXPECT_TRUE(isValidIdentifier("_$", /*AllowDollar=*/true));
+ EXPECT_FALSE(isValidAsciiIdentifier("_."));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\n"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_ "));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\x80"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\xc2"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\xff"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_$"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_1"));
+
+ EXPECT_TRUE(isValidAsciiIdentifier("__"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_a"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_z"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_A"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_Z"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_$", /*AllowDollar=*/true));
// 3 characters, '__' prefix
- EXPECT_FALSE(isValidIdentifier("__."));
- EXPECT_FALSE(isValidIdentifier("__\n"));
- EXPECT_FALSE(isValidIdentifier("__ "));
- EXPECT_FALSE(isValidIdentifier("__\x80"));
- EXPECT_FALSE(isValidIdentifier("__\xc2"));
- EXPECT_FALSE(isValidIdentifier("__\xff"));
- EXPECT_FALSE(isValidIdentifier("__$"));
- EXPECT_TRUE(isValidIdentifier("__1"));
-
- EXPECT_TRUE(isValidIdentifier("___"));
- EXPECT_TRUE(isValidIdentifier("__a"));
- EXPECT_TRUE(isValidIdentifier("__z"));
- EXPECT_TRUE(isValidIdentifier("__A"));
- EXPECT_TRUE(isValidIdentifier("__Z"));
- EXPECT_TRUE(isValidIdentifier("__$", /*AllowDollar=*/true));
+ EXPECT_FALSE(isValidAsciiIdentifier("__."));
+ EXPECT_FALSE(isValidAsciiIdentifier("__\n"));
+ EXPECT_FALSE(isValidAsciiIdentifier("__ "));
+ EXPECT_FALSE(isValidAsciiIdentifier("__\x80"));
+ EXPECT_FALSE(isValidAsciiIdentifier("__\xc2"));
+ EXPECT_FALSE(isValidAsciiIdentifier("__\xff"));
+ EXPECT_FALSE(isValidAsciiIdentifier("__$"));
+ EXPECT_TRUE(isValidAsciiIdentifier("__1"));
+
+ EXPECT_TRUE(isValidAsciiIdentifier("___"));
+ EXPECT_TRUE(isValidAsciiIdentifier("__a"));
+ EXPECT_TRUE(isValidAsciiIdentifier("__z"));
+ EXPECT_TRUE(isValidAsciiIdentifier("__A"));
+ EXPECT_TRUE(isValidAsciiIdentifier("__Z"));
+ EXPECT_TRUE(isValidAsciiIdentifier("__$", /*AllowDollar=*/true));
// 3 characters, '_' prefix and suffix
- EXPECT_FALSE(isValidIdentifier("_._"));
- EXPECT_FALSE(isValidIdentifier("_\n_"));
- EXPECT_FALSE(isValidIdentifier("_ _"));
- EXPECT_FALSE(isValidIdentifier("_\x80_"));
- EXPECT_FALSE(isValidIdentifier("_\xc2_"));
- EXPECT_FALSE(isValidIdentifier("_\xff_"));
- EXPECT_FALSE(isValidIdentifier("_$_"));
- EXPECT_TRUE(isValidIdentifier("_1_"));
-
- EXPECT_TRUE(isValidIdentifier("___"));
- EXPECT_TRUE(isValidIdentifier("_a_"));
- EXPECT_TRUE(isValidIdentifier("_z_"));
- EXPECT_TRUE(isValidIdentifier("_A_"));
- EXPECT_TRUE(isValidIdentifier("_Z_"));
- EXPECT_TRUE(isValidIdentifier("_$_", /*AllowDollar=*/true));
+ EXPECT_FALSE(isValidAsciiIdentifier("_._"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\n_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_ _"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\x80_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\xc2_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_\xff_"));
+ EXPECT_FALSE(isValidAsciiIdentifier("_$_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_1_"));
+
+ EXPECT_TRUE(isValidAsciiIdentifier("___"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_a_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_z_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_A_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_Z_"));
+ EXPECT_TRUE(isValidAsciiIdentifier("_$_", /*AllowDollar=*/true));
}
More information about the cfe-commits mailing list