[clang] d8f5a18 - Perf/lexer faster slow get char and size (#70543)
via cfe-commits
cfe-commits at lists.llvm.org
Sun Oct 29 11:17:06 PDT 2023
Author: serge-sans-paille
Date: 2023-10-29T18:17:02Z
New Revision: d8f5a18b6e587aeaa8b99707e87b652f49b160cd
URL: https://github.com/llvm/llvm-project/commit/d8f5a18b6e587aeaa8b99707e87b652f49b160cd
DIFF: https://github.com/llvm/llvm-project/commit/d8f5a18b6e587aeaa8b99707e87b652f49b160cd.diff
LOG: Perf/lexer faster slow get char and size (#70543)
Co-authored-by: serge-sans-paille <sguelton at mozilla.com>
Added:
Modified:
clang/include/clang/Lex/Lexer.h
clang/lib/Lex/DependencyDirectivesScanner.cpp
clang/lib/Lex/Lexer.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index ac0ef14c591bdd7..899e665e7454652 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -575,19 +575,23 @@ class Lexer : public PreprocessorLexer {
/// sequence.
static bool isNewLineEscaped(const char *BufferStart, const char *Str);
+ /// Represents a char and the number of bytes parsed to produce it.
+ struct SizedChar {
+ char Char;
+ unsigned Size;
+ };
+
/// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
/// emit a warning.
- static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
- const LangOptions &LangOpts) {
+ static inline SizedChar getCharAndSizeNoWarn(const char *Ptr,
+ const LangOptions &LangOpts) {
// If this is not a trigraph and not a UCN or escaped newline, return
// quickly.
if (isObviouslySimpleCharacter(Ptr[0])) {
- Size = 1;
- return *Ptr;
+ return {*Ptr, 1u};
}
- Size = 0;
- return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+ return getCharAndSizeSlowNoWarn(Ptr, LangOpts);
}
/// Returns the leading whitespace for line that corresponds to the given
@@ -665,8 +669,7 @@ class Lexer : public PreprocessorLexer {
// quickly.
if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
- unsigned Size = 0;
- char C = getCharAndSizeSlow(Ptr, Size, &Tok);
+ auto [C, Size] = getCharAndSizeSlow(Ptr, &Tok);
Ptr += Size;
return C;
}
@@ -682,9 +685,7 @@ class Lexer : public PreprocessorLexer {
// Otherwise, re-lex the character with a current token, allowing
// diagnostics to be emitted and flags to be set.
- Size = 0;
- getCharAndSizeSlow(Ptr, Size, &Tok);
- return Ptr+Size;
+ return Ptr + getCharAndSizeSlow(Ptr, &Tok).Size;
}
/// getCharAndSize - Peek a single 'character' from the specified buffer,
@@ -699,14 +700,14 @@ class Lexer : public PreprocessorLexer {
return *Ptr;
}
- Size = 0;
- return getCharAndSizeSlow(Ptr, Size);
+ auto CharAndSize = getCharAndSizeSlow(Ptr);
+ Size = CharAndSize.Size;
+ return CharAndSize.Char;
}
/// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
/// method.
- char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
- Token *Tok = nullptr);
+ SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
@@ -720,8 +721,8 @@ class Lexer : public PreprocessorLexer {
/// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
/// diagnostic.
- static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
- const LangOptions &LangOpts);
+ static SizedChar getCharAndSizeSlowNoWarn(const char *Ptr,
+ const LangOptions &LangOpts);
//===--------------------------------------------------------------------===//
// Other lexer functions.
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 2bd2c5f8388c0dd..980f865cf24c97e 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -565,9 +565,8 @@ Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
const char *BufPtr = Input.begin() + Tok.Offset;
const char *AfterIdent = Input.begin() + Tok.getEnd();
while (BufPtr < AfterIdent) {
- unsigned Size;
- Spelling[SpellingLength++] =
- Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+ auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+ Spelling[SpellingLength++] = Char;
BufPtr += Size;
}
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 675ec28e514797e..1c53997527732a9 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -287,9 +287,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
if (tok::isStringLiteral(Tok.getKind())) {
// Munch the encoding-prefix and opening double-quote.
while (BufPtr < BufEnd) {
- unsigned Size;
- Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
- BufPtr += Size;
+ auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+ Spelling[Length++] = CharAndSize.Char;
+ BufPtr += CharAndSize.Size;
if (Spelling[Length - 1] == '"')
break;
@@ -316,9 +316,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
}
while (BufPtr < BufEnd) {
- unsigned Size;
- Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
- BufPtr += Size;
+ auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+ Spelling[Length++] = CharAndSize.Char;
+ BufPtr += CharAndSize.Size;
}
assert(Length < Tok.getLength() &&
@@ -772,10 +772,9 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
// If we have a character that may be a trigraph or escaped newline, use a
// lexer to parse it correctly.
for (; CharNo; --CharNo) {
- unsigned Size;
- Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
- TokPtr += Size;
- PhysOffset += Size;
+ auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
+ TokPtr += CharAndSize.Size;
+ PhysOffset += CharAndSize.Size;
}
// Final detail: if we end up on an escaped newline, we want to return the
@@ -1357,15 +1356,16 @@ SourceLocation Lexer::findLocationAfterToken(
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
-char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
- Token *Tok) {
+Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
+ unsigned Size = 0;
// If we have a slash, look for an escaped newline.
if (Ptr[0] == '\\') {
++Size;
++Ptr;
Slash:
// Common case, backslash-char where the char is not whitespace.
- if (!isWhitespace(Ptr[0])) return '\\';
+ if (!isWhitespace(Ptr[0]))
+ return {'\\', Size};
// See if we have optional whitespace characters between the slash and
// newline.
@@ -1382,11 +1382,13 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
Ptr += EscapedNewLineSize;
// Use slow version to accumulate a correct size field.
- return getCharAndSizeSlow(Ptr, Size, Tok);
+ auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
+ CharAndSize.Size += Size;
+ return CharAndSize;
}
// Otherwise, this is not an escaped newline, just return the slash.
- return '\\';
+ return {'\\', Size};
}
// If this is a trigraph, process it.
@@ -1401,13 +1403,12 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
Ptr += 3;
Size += 3;
if (C == '\\') goto Slash;
- return C;
+ return {C, Size};
}
}
// If this is neither, return a single character.
- ++Size;
- return *Ptr;
+ return {*Ptr, Size + 1u};
}
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
@@ -1416,15 +1417,18 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
-char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
- const LangOptions &LangOpts) {
+Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
+ const LangOptions &LangOpts) {
+
+ unsigned Size = 0;
// If we have a slash, look for an escaped newline.
if (Ptr[0] == '\\') {
++Size;
++Ptr;
Slash:
// Common case, backslash-char where the char is not whitespace.
- if (!isWhitespace(Ptr[0])) return '\\';
+ if (!isWhitespace(Ptr[0]))
+ return {'\\', Size};
// See if we have optional whitespace characters followed by a newline.
if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
@@ -1433,11 +1437,13 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
Ptr += EscapedNewLineSize;
// Use slow version to accumulate a correct size field.
- return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+ auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
+ CharAndSize.Size += Size;
+ return CharAndSize;
}
// Otherwise, this is not an escaped newline, just return the slash.
- return '\\';
+ return {'\\', Size};
}
// If this is a trigraph, process it.
@@ -1448,13 +1454,12 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
Ptr += 3;
Size += 3;
if (C == '\\') goto Slash;
- return C;
+ return {C, Size};
}
}
// If this is neither, return a single character.
- ++Size;
- return *Ptr;
+ return {*Ptr, Size + 1u};
}
//===----------------------------------------------------------------------===//
@@ -1964,11 +1969,14 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
/// isHexaLiteral - Return true if Start points to a hex constant.
/// in microsoft mode (where this is supposed to be several different tokens).
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
- unsigned Size;
- char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
+ auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
+ char C1 = CharAndSize1.Char;
if (C1 != '0')
return false;
- char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
+
+ auto CharAndSize2 =
+ Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
+ char C2 = CharAndSize2.Char;
return (C2 == 'x' || C2 == 'X');
}
@@ -2012,8 +2020,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
// If we have a digit separator, continue.
if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
- unsigned NextSize;
- char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
+ auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
if (isAsciiIdentifierContinue(Next)) {
if (!isLexingRawMode())
Diag(CurPtr, LangOpts.CPlusPlus
@@ -2085,8 +2092,8 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
unsigned Consumed = Size;
unsigned Chars = 1;
while (true) {
- unsigned NextSize;
- char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
+ auto [Next, NextSize] =
+ getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
if (!isAsciiIdentifierContinue(Next)) {
// End of suffix. Check whether this is on the allowed list.
const StringRef CompleteSuffix(Buffer, Chars);
More information about the cfe-commits
mailing list