[clang] [Lexer] Prevent hitting the file system for ASTReader tokens (PR #192492)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Apr 16 10:26:30 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
Author: lucasvallejoo
<details>
<summary>Changes</summary>
This patch resolves an issue where the Lexer would go back to the physical file system to measure a token's length (via `MeasureTokenLength`) even when the `SourceLocation` had been loaded from a precompiled AST or module.
In environments like interactive C++ (ROOT/Cling), where the original headers might be temporary or removed after PCH generation, this caused fatal 'file not found' errors.
This upstreaming effort corresponds to ROOT-7111. It includes a robust, cross-platform regression test that deletes the underlying header and uses `-ast-dump` to force source location resolution without triggering the diagnostics engine's file system checks.
**Note:** This work is submitted as part of my evaluation process for the ROOT/Compiler Fellowship program.
CC: @<!-- -->vgvassilev
---
Patch is 64.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/192492.diff
2 Files Affected:
- (modified) clang/lib/Lex/Lexer.cpp (+379-326)
- (added) clang/test/Lexer/pch-deleted-header.cpp (+15)
``````````diff
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 10246552bb13d..3941c49c3c964 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -151,8 +151,8 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
// Determine the size of the BOM.
StringRef Buf(BufferStart, BufferEnd - BufferStart);
size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
- .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
- .Default(0);
+ .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
+ .Default(0);
// Skip the BOM.
BufferPtr += BOMLength;
@@ -266,14 +266,14 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
const char *StrData = SM.getCharacterData(SpellingLoc);
L->BufferPtr = StrData;
- L->BufferEnd = StrData+TokLen;
+ L->BufferEnd = StrData + TokLen;
assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
// Set the SourceLocation with the remapping information. This ensures that
// GetMappedTokenLoc will remap the tokens as they are lexed.
- L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
- ExpansionLocStart,
- ExpansionLocEnd, TokLen);
+ L->FileLoc =
+ SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
+ ExpansionLocStart, ExpansionLocEnd, TokLen);
// Ensure that the lexer thinks it is inside a directive, so that end \n will
// return an EOD token.
@@ -352,12 +352,14 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
// Raw string literals need special handling; trigraph expansion and line
// splicing do not occur within their d-char-sequence nor within their
// r-char-sequence.
- if (Length >= 2 &&
- Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
+ if (Length >= 2 && Spelling[Length - 2] == 'R' &&
+ Spelling[Length - 1] == '"') {
// Search backwards from the end of the token to find the matching closing
// quote.
const char *RawEnd = BufEnd;
- do --RawEnd; while (*RawEnd != '"');
+ do
+ --RawEnd;
+ while (*RawEnd != '"');
size_t RawLength = RawEnd - BufPtr + 1;
// Everything between the quotes is included verbatim in the spelling.
@@ -385,11 +387,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
-StringRef Lexer::getSpelling(SourceLocation loc,
- SmallVectorImpl<char> &buffer,
+StringRef Lexer::getSpelling(SourceLocation loc, SmallVectorImpl<char> &buffer,
const SourceManager &SM,
- const LangOptions &options,
- bool *invalid) {
+ const LangOptions &options, bool *invalid) {
// Break down the source location.
FileIDAndOffset locInfo = SM.getDecomposedLoc(loc);
@@ -397,15 +397,16 @@ StringRef Lexer::getSpelling(SourceLocation loc,
bool invalidTemp = false;
StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
if (invalidTemp) {
- if (invalid) *invalid = true;
+ if (invalid)
+ *invalid = true;
return {};
}
const char *tokenBegin = file.data() + locInfo.second;
// Lex from the start of the given location.
- Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
- file.begin(), tokenBegin, file.end());
+ Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, file.begin(),
+ tokenBegin, file.end());
Token token;
lexer.LexFromRawLexer(token);
@@ -431,8 +432,8 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
bool CharDataInvalid = false;
- const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
- &CharDataInvalid);
+ const char *TokStart =
+ SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
if (Invalid)
*Invalid = CharDataInvalid;
if (CharDataInvalid)
@@ -498,15 +499,14 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
}
// Otherwise, hard case, relex the characters into the string.
- return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
+ return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char *>(Buffer));
}
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
-unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
- const SourceManager &SM,
+unsigned Lexer::MeasureTokenLength(SourceLocation Loc, const SourceManager &SM,
const LangOptions &LangOpts) {
Token TheTok;
if (getRawToken(Loc, TheTok, SM, LangOpts))
@@ -517,8 +517,7 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
/// Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
- const SourceManager &SM,
- const LangOptions &LangOpts,
+ const SourceManager &SM, const LangOptions &LangOpts,
bool IgnoreWhiteSpace) {
// TODO: this could be special cased for common tokens like identifiers, ')',
// etc to make this faster, if it mattered. Just look at StrData[0] to handle
@@ -535,7 +534,7 @@ bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
if (Invalid)
return true;
- const char *StrData = Buffer.data()+LocInfo.second;
+ const char *StrData = Buffer.data() + LocInfo.second;
if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
return true;
@@ -635,10 +634,7 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
namespace {
-enum PreambleDirectiveKind {
- PDK_Skipped,
- PDK_Unknown
-};
+enum PreambleDirectiveKind { PDK_Skipped, PDK_Unknown };
} // namespace
@@ -722,31 +718,31 @@ PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
TheLexer.LexFromRawLexer(TheTok);
if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
StringRef Keyword = TheTok.getRawIdentifier();
- PreambleDirectiveKind PDK
- = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
- .Case("include", PDK_Skipped)
- .Case("__include_macros", PDK_Skipped)
- .Case("define", PDK_Skipped)
- .Case("undef", PDK_Skipped)
- .Case("line", PDK_Skipped)
- .Case("error", PDK_Skipped)
- .Case("pragma", PDK_Skipped)
- .Case("import", PDK_Skipped)
- .Case("include_next", PDK_Skipped)
- .Case("warning", PDK_Skipped)
- .Case("ident", PDK_Skipped)
- .Case("sccs", PDK_Skipped)
- .Case("assert", PDK_Skipped)
- .Case("unassert", PDK_Skipped)
- .Case("if", PDK_Skipped)
- .Case("ifdef", PDK_Skipped)
- .Case("ifndef", PDK_Skipped)
- .Case("elif", PDK_Skipped)
- .Case("elifdef", PDK_Skipped)
- .Case("elifndef", PDK_Skipped)
- .Case("else", PDK_Skipped)
- .Case("endif", PDK_Skipped)
- .Default(PDK_Unknown);
+ PreambleDirectiveKind PDK =
+ llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
+ .Case("include", PDK_Skipped)
+ .Case("__include_macros", PDK_Skipped)
+ .Case("define", PDK_Skipped)
+ .Case("undef", PDK_Skipped)
+ .Case("line", PDK_Skipped)
+ .Case("error", PDK_Skipped)
+ .Case("pragma", PDK_Skipped)
+ .Case("import", PDK_Skipped)
+ .Case("include_next", PDK_Skipped)
+ .Case("warning", PDK_Skipped)
+ .Case("ident", PDK_Skipped)
+ .Case("sccs", PDK_Skipped)
+ .Case("assert", PDK_Skipped)
+ .Case("unassert", PDK_Skipped)
+ .Case("if", PDK_Skipped)
+ .Case("ifdef", PDK_Skipped)
+ .Case("ifndef", PDK_Skipped)
+ .Case("elif", PDK_Skipped)
+ .Case("elifdef", PDK_Skipped)
+ .Case("elifndef", PDK_Skipped)
+ .Case("else", PDK_Skipped)
+ .Case("endif", PDK_Skipped)
+ .Default(PDK_Unknown);
switch (PDK) {
case PDK_Skipped:
@@ -835,7 +831,7 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
// advanced by 3 should return the location of b, not of \\. One compounding
// detail of this is that the escape may be made by a trigraph.
if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
- PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
+ PhysOffset += Lexer::SkipEscapedNewLines(TokPtr) - TokPtr;
return PhysOffset;
}
@@ -874,6 +870,10 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
return Loc;
}
+ // Don't hit the file system for ASTReader tokens.
+ if (SM.isLoadedSourceLocation(Loc))
+ return Loc;
+
unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
if (Len > Offset)
Len = Len - Offset;
@@ -907,8 +907,7 @@ bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
-bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
- const SourceManager &SM,
+bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM,
const LangOptions &LangOpts,
SourceLocation *MacroEnd) {
assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
@@ -940,7 +939,7 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
SourceLocation End = Range.getEnd();
assert(Begin.isFileID() && End.isFileID());
if (Range.isTokenRange()) {
- End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
+ End = Lexer::getLocForEndOfToken(End, 0, SM, LangOpts);
if (End.isInvalid())
return {};
}
@@ -951,8 +950,7 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
return {};
unsigned EndOffs;
- if (!SM.isInFileID(End, FID, &EndOffs) ||
- BeginOffs > EndOffs)
+ if (!SM.isInFileID(End, FID, &EndOffs) || BeginOffs > EndOffs)
return {};
return CharSourceRange::getCharRange(Begin, End);
@@ -999,10 +997,10 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
assert(Begin.isMacroID() && End.isMacroID());
SourceLocation MacroBegin, MacroEnd;
if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
- ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
- &MacroEnd)) ||
- (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
- &MacroEnd)))) {
+ ((Range.isTokenRange() &&
+ isAtEndOfMacroExpansion(End, SM, LangOpts, &MacroEnd)) ||
+ (Range.isCharRange() &&
+ isAtStartOfMacroExpansion(End, SM, LangOpts, &MacroEnd)))) {
Range.setBegin(MacroBegin);
Range.setEnd(MacroEnd);
// Use the *original* `End`, not the expanded one in `MacroEnd`.
@@ -1012,14 +1010,14 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
}
bool Invalid = false;
- const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
- &Invalid);
+ const SrcMgr::SLocEntry &BeginEntry =
+ SM.getSLocEntry(SM.getFileID(Begin), &Invalid);
if (Invalid)
return {};
if (BeginEntry.getExpansion().isMacroArgExpansion()) {
- const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
- &Invalid);
+ const SrcMgr::SLocEntry &EndEntry =
+ SM.getSLocEntry(SM.getFileID(End), &Invalid);
if (Invalid)
return {};
@@ -1035,27 +1033,28 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
return {};
}
-StringRef Lexer::getSourceText(CharSourceRange Range,
- const SourceManager &SM,
- const LangOptions &LangOpts,
- bool *Invalid) {
+StringRef Lexer::getSourceText(CharSourceRange Range, const SourceManager &SM,
+ const LangOptions &LangOpts, bool *Invalid) {
Range = makeFileCharRange(Range, SM, LangOpts);
if (Range.isInvalid()) {
- if (Invalid) *Invalid = true;
+ if (Invalid)
+ *Invalid = true;
return {};
}
// Break down the source location.
FileIDAndOffset beginInfo = SM.getDecomposedLoc(Range.getBegin());
if (beginInfo.first.isInvalid()) {
- if (Invalid) *Invalid = true;
+ if (Invalid)
+ *Invalid = true;
return {};
}
unsigned EndOffs;
if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
beginInfo.second > EndOffs) {
- if (Invalid) *Invalid = true;
+ if (Invalid)
+ *Invalid = true;
return {};
}
@@ -1063,11 +1062,13 @@ StringRef Lexer::getSourceText(CharSourceRange Range,
bool invalidTemp = false;
StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
if (invalidTemp) {
- if (Invalid) *Invalid = true;
+ if (Invalid)
+ *Invalid = true;
return {};
}
- if (Invalid) *Invalid = false;
+ if (Invalid)
+ *Invalid = false;
return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}
@@ -1201,8 +1202,8 @@ StringRef Lexer::getIndentationForLine(SourceLocation Loc,
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
- SourceLocation FileLoc,
- unsigned CharNo, unsigned TokLen) {
+ SourceLocation FileLoc, unsigned CharNo,
+ unsigned TokLen) {
assert(FileLoc.isMacroID() && "Must be a macro expansion");
// Otherwise, we're lexing "mapped tokens". This is used for things like
@@ -1231,7 +1232,7 @@ SourceLocation Lexer::getSourceLocation(const char *Loc,
// In the normal case, we're just lexing from a simple file buffer, return
// the file id from FileLoc with the offset specified.
- unsigned CharNo = Loc-BufferStart;
+ unsigned CharNo = Loc - BufferStart;
if (FileLoc.isFileID())
return FileLoc.getLocWithOffset(CharNo);
@@ -1255,16 +1256,26 @@ DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
switch (Letter) {
- default: return 0;
- case '=': return '#';
- case ')': return ']';
- case '(': return '[';
- case '!': return '|';
- case '\'': return '^';
- case '>': return '}';
- case '/': return '\\';
- case '<': return '{';
- case '-': return '~';
+ default:
+ return 0;
+ case '=':
+ return '#';
+ case ')':
+ return ']';
+ case '(':
+ return '[';
+ case '!':
+ return '|';
+ case '\'':
+ return '^';
+ case '>':
+ return '}';
+ case '/':
+ return '\\';
+ case '<':
+ return '{';
+ case '-':
+ return '~';
}
}
@@ -1279,12 +1290,12 @@ static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
if (!Trigraphs) {
if (L && !L->isLexingRawMode())
- L->Diag(CP-2, diag::trigraph_ignored);
+ L->Diag(CP - 2, diag::trigraph_ignored);
return 0;
}
if (L && !L->isLexingRawMode())
- L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
+ L->Diag(CP - 2, diag::trigraph_converted) << StringRef(&Res, 1);
return Res;
}
@@ -1296,12 +1307,11 @@ unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
while (isWhitespace(Ptr[Size])) {
++Size;
- if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
+ if (Ptr[Size - 1] != '\n' && Ptr[Size - 1] != '\r')
continue;
// If this is a \r\n or \n\r, skip the other half.
- if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
- Ptr[Size-1] != Ptr[Size])
+ if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && Ptr[Size - 1] != Ptr[Size])
++Size;
return Size;
@@ -1318,21 +1328,22 @@ const char *Lexer::SkipEscapedNewLines(const char *P) {
while (true) {
const char *AfterEscape;
if (*P == '\\') {
- AfterEscape = P+1;
+ AfterEscape = P + 1;
} else if (*P == '?') {
// If not a trigraph for escape, bail out.
if (P[1] != '?' || P[2] != '/')
return P;
// FIXME: Take LangOpts into account; the language might not
// support trigraphs.
- AfterEscape = P+3;
+ AfterEscape = P + 3;
} else {
return P;
}
unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
- if (NewLineSize == 0) return P;
- P = AfterEscape+NewLineSize;
+ if (NewLineSize == 0)
+ return P;
+ P = AfterEscape + NewLineSize;
}
}
@@ -1359,7 +1370,7 @@ std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
// Lex from the start of the given location.
Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
- TokenBegin, File.end());
+ TokenBegin, File.end());
lexer.SetCommentRetentionState(IncludeComments);
// Find the token.
Token Tok;
@@ -1444,7 +1455,7 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
if (Ptr[0] == '\\') {
++Size;
++Ptr;
-Slash:
+ Slash:
// Common case, backslash-char where the char is not whitespace.
if (!isWhitespace(Ptr[0]))
return {'\\', Size};
@@ -1453,7 +1464,8 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
// newline.
if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
// Remember that this token needs to be cleaned.
- if (Tok) Tok->setFlag(Token::NeedsCleaning);
+ if (Tok)
+ Tok->setFlag(Token::NeedsCleaning);
// Warn if there was whitespace between the backslash and newline.
if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
@@ -1461,7 +1473,7 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
// Found backslash<whitespace><newline>. Parse the char after it.
Size += EscapedNewLineSize;
- Ptr += EscapedNewLineSize;
+ Ptr += EscapedNewLineSize;
// Use slow version to accumulate a correct size field.
auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
@@ -1480,11 +1492,13 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
LangOpts.Trigraphs)) {
// Remember that this token needs to be cleaned.
- if (Tok) Tok->setFlag(Token::NeedsCleaning);
+ if (Tok)
+ Tok->setFlag(Token::NeedsCleaning);
Ptr += 3;
Size += 3;
- if (C == '\\') goto Slash;
+ if (C == '\\')
+ goto Slash;
return {C, Size};
}
}
@@ -1507,7 +1521,7 @@ Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
if (Ptr[0] == '\\') {
++Size;
++Ptr;
-Slash:
+ Slash:
// Common case, backslash-char where the char is not whitespace.
if (!isWhitespace(Ptr[0]))
return {'\\', Size};
@@ -1516,7 +15...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/192492
More information about the cfe-commits mailing list