[clang] [Lexer] Prevent hitting the file system for ASTReader tokens (PR #192492)

Thu Apr 16 10:26:30 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: lucasvallejoo

<details>
<summary>Changes</summary>

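This patch resolves an issue where the Lexer would try to measure a token's length by re-lexing the original source file (via `MeasureTokenLength`), even when the `SourceLocation` had been loaded from a precompiled AST or module. `Lexer::getLocForEndOfToken` now checks `SM.isLoadedSourceLocation(Loc)` and returns such locations unchanged instead of calling `MeasureTokenLength`.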

In environments such as interactive C++ (ROOT/Cling), where the original headers may be temporary or removed after PCH generation, this caused fatal 'file not found' errors.

This upstreaming effort matches ROOT-7111. It includes a robust, cross-platform regression test that deletes the underlying header and uses -ast-dump to force source location resolution without triggering the diagnostics engine's file system checks.

**Note:** This work is submitted as part of my evaluation process for the ROOT/Compiler Fellowship program.
CC: @vgvassilev

---

Patch is 64.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/192492.diff


2 Files Affected:

- (modified) clang/lib/Lex/Lexer.cpp (+379-326) 
- (added) clang/test/Lexer/pch-deleted-header.cpp (+15) 


``````````diff

diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 10246552bb13d..3941c49c3c964 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -151,8 +151,8 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
     // Determine the size of the BOM.
     StringRef Buf(BufferStart, BufferEnd - BufferStart);
     size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
-      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
-      .Default(0);
+                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
+                           .Default(0);
 
     // Skip the BOM.
     BufferPtr += BOMLength;
@@ -266,14 +266,14 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
   const char *StrData = SM.getCharacterData(SpellingLoc);
 
   L->BufferPtr = StrData;
-  L->BufferEnd = StrData+TokLen;
+  L->BufferEnd = StrData + TokLen;
   assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
 
   // Set the SourceLocation with the remapping information.  This ensures that
   // GetMappedTokenLoc will remap the tokens as they are lexed.
-  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
-                                     ExpansionLocStart,
-                                     ExpansionLocEnd, TokLen);
+  L->FileLoc =
+      SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
+                            ExpansionLocStart, ExpansionLocEnd, TokLen);
 
   // Ensure that the lexer thinks it is inside a directive, so that end \n will
   // return an EOD token.
@@ -352,12 +352,14 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
     // Raw string literals need special handling; trigraph expansion and line
     // splicing do not occur within their d-char-sequence nor within their
     // r-char-sequence.
-    if (Length >= 2 &&
-        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
+    if (Length >= 2 && Spelling[Length - 2] == 'R' &&
+        Spelling[Length - 1] == '"') {
       // Search backwards from the end of the token to find the matching closing
       // quote.
       const char *RawEnd = BufEnd;
-      do --RawEnd; while (*RawEnd != '"');
+      do
+        --RawEnd;
+      while (*RawEnd != '"');
       size_t RawLength = RawEnd - BufPtr + 1;
 
       // Everything between the quotes is included verbatim in the spelling.
@@ -385,11 +387,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
 /// after trigraph expansion and escaped-newline folding.  In particular, this
 /// wants to get the true, uncanonicalized, spelling of things like digraphs
 /// UCNs, etc.
-StringRef Lexer::getSpelling(SourceLocation loc,
-                             SmallVectorImpl<char> &buffer,
+StringRef Lexer::getSpelling(SourceLocation loc, SmallVectorImpl<char> &buffer,
                              const SourceManager &SM,
-                             const LangOptions &options,
-                             bool *invalid) {
+                             const LangOptions &options, bool *invalid) {
   // Break down the source location.
   FileIDAndOffset locInfo = SM.getDecomposedLoc(loc);
 
@@ -397,15 +397,16 @@ StringRef Lexer::getSpelling(SourceLocation loc,
   bool invalidTemp = false;
   StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
   if (invalidTemp) {
-    if (invalid) *invalid = true;
+    if (invalid)
+      *invalid = true;
     return {};
   }
 
   const char *tokenBegin = file.data() + locInfo.second;
 
   // Lex from the start of the given location.
-  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
-              file.begin(), tokenBegin, file.end());
+  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, file.begin(),
+              tokenBegin, file.end());
   Token token;
   lexer.LexFromRawLexer(token);
 
@@ -431,8 +432,8 @@ std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
 
   bool CharDataInvalid = false;
-  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
-                                                    &CharDataInvalid);
+  const char *TokStart =
+      SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
   if (Invalid)
     *Invalid = CharDataInvalid;
   if (CharDataInvalid)
@@ -498,15 +499,14 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
   }
 
   // Otherwise, hard case, relex the characters into the string.
-  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
+  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char *>(Buffer));
 }
 
 /// MeasureTokenLength - Relex the token at the specified location and return
 /// its length in bytes in the input file.  If the token needs cleaning (e.g.
 /// includes a trigraph or an escaped newline) then this count includes bytes
 /// that are part of that.
-unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
-                                   const SourceManager &SM,
+unsigned Lexer::MeasureTokenLength(SourceLocation Loc, const SourceManager &SM,
                                    const LangOptions &LangOpts) {
   Token TheTok;
   if (getRawToken(Loc, TheTok, SM, LangOpts))
@@ -517,8 +517,7 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
 /// Relex the token at the specified location.
 /// \returns true if there was a failure, false on success.
 bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
-                        const SourceManager &SM,
-                        const LangOptions &LangOpts,
+                        const SourceManager &SM, const LangOptions &LangOpts,
                         bool IgnoreWhiteSpace) {
   // TODO: this could be special cased for common tokens like identifiers, ')',
   // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
@@ -535,7 +534,7 @@ bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
   if (Invalid)
     return true;
 
-  const char *StrData = Buffer.data()+LocInfo.second;
+  const char *StrData = Buffer.data() + LocInfo.second;
 
   if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
     return true;
@@ -635,10 +634,7 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
 
 namespace {
 
-enum PreambleDirectiveKind {
-  PDK_Skipped,
-  PDK_Unknown
-};
+enum PreambleDirectiveKind { PDK_Skipped, PDK_Unknown };
 
 } // namespace
 
@@ -722,31 +718,31 @@ PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
       TheLexer.LexFromRawLexer(TheTok);
       if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
         StringRef Keyword = TheTok.getRawIdentifier();
-        PreambleDirectiveKind PDK
-          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
-              .Case("include", PDK_Skipped)
-              .Case("__include_macros", PDK_Skipped)
-              .Case("define", PDK_Skipped)
-              .Case("undef", PDK_Skipped)
-              .Case("line", PDK_Skipped)
-              .Case("error", PDK_Skipped)
-              .Case("pragma", PDK_Skipped)
-              .Case("import", PDK_Skipped)
-              .Case("include_next", PDK_Skipped)
-              .Case("warning", PDK_Skipped)
-              .Case("ident", PDK_Skipped)
-              .Case("sccs", PDK_Skipped)
-              .Case("assert", PDK_Skipped)
-              .Case("unassert", PDK_Skipped)
-              .Case("if", PDK_Skipped)
-              .Case("ifdef", PDK_Skipped)
-              .Case("ifndef", PDK_Skipped)
-              .Case("elif", PDK_Skipped)
-              .Case("elifdef", PDK_Skipped)
-              .Case("elifndef", PDK_Skipped)
-              .Case("else", PDK_Skipped)
-              .Case("endif", PDK_Skipped)
-              .Default(PDK_Unknown);
+        PreambleDirectiveKind PDK =
+            llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
+                .Case("include", PDK_Skipped)
+                .Case("__include_macros", PDK_Skipped)
+                .Case("define", PDK_Skipped)
+                .Case("undef", PDK_Skipped)
+                .Case("line", PDK_Skipped)
+                .Case("error", PDK_Skipped)
+                .Case("pragma", PDK_Skipped)
+                .Case("import", PDK_Skipped)
+                .Case("include_next", PDK_Skipped)
+                .Case("warning", PDK_Skipped)
+                .Case("ident", PDK_Skipped)
+                .Case("sccs", PDK_Skipped)
+                .Case("assert", PDK_Skipped)
+                .Case("unassert", PDK_Skipped)
+                .Case("if", PDK_Skipped)
+                .Case("ifdef", PDK_Skipped)
+                .Case("ifndef", PDK_Skipped)
+                .Case("elif", PDK_Skipped)
+                .Case("elifdef", PDK_Skipped)
+                .Case("elifndef", PDK_Skipped)
+                .Case("else", PDK_Skipped)
+                .Case("endif", PDK_Skipped)
+                .Default(PDK_Unknown);
 
         switch (PDK) {
         case PDK_Skipped:
@@ -835,7 +831,7 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
   // advanced by 3 should return the location of b, not of \\.  One compounding
   // detail of this is that the escape may be made by a trigraph.
   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
-    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
+    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr) - TokPtr;
 
   return PhysOffset;
 }
@@ -874,6 +870,10 @@ SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
       return Loc;
   }
 
+  // Don't hit the file system for ASTReader tokens.
+  if (SM.isLoadedSourceLocation(Loc))
+    return Loc;
+
   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
   if (Len > Offset)
     Len = Len - Offset;
@@ -907,8 +907,7 @@ bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
 
 /// Returns true if the given MacroID location points at the last
 /// token of the macro expansion.
-bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
-                                    const SourceManager &SM,
+bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM,
                                     const LangOptions &LangOpts,
                                     SourceLocation *MacroEnd) {
   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
@@ -940,7 +939,7 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
   SourceLocation End = Range.getEnd();
   assert(Begin.isFileID() && End.isFileID());
   if (Range.isTokenRange()) {
-    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
+    End = Lexer::getLocForEndOfToken(End, 0, SM, LangOpts);
     if (End.isInvalid())
       return {};
   }
@@ -951,8 +950,7 @@ static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
     return {};
 
   unsigned EndOffs;
-  if (!SM.isInFileID(End, FID, &EndOffs) ||
-      BeginOffs > EndOffs)
+  if (!SM.isInFileID(End, FID, &EndOffs) || BeginOffs > EndOffs)
     return {};
 
   return CharSourceRange::getCharRange(Begin, End);
@@ -999,10 +997,10 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
   assert(Begin.isMacroID() && End.isMacroID());
   SourceLocation MacroBegin, MacroEnd;
   if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
-      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
-                                                        &MacroEnd)) ||
-       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
-                                                         &MacroEnd)))) {
+      ((Range.isTokenRange() &&
+        isAtEndOfMacroExpansion(End, SM, LangOpts, &MacroEnd)) ||
+       (Range.isCharRange() &&
+        isAtStartOfMacroExpansion(End, SM, LangOpts, &MacroEnd)))) {
     Range.setBegin(MacroBegin);
     Range.setEnd(MacroEnd);
     // Use the *original* `End`, not the expanded one in `MacroEnd`.
@@ -1012,14 +1010,14 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
   }
 
   bool Invalid = false;
-  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
-                                                        &Invalid);
+  const SrcMgr::SLocEntry &BeginEntry =
+      SM.getSLocEntry(SM.getFileID(Begin), &Invalid);
   if (Invalid)
     return {};
 
   if (BeginEntry.getExpansion().isMacroArgExpansion()) {
-    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
-                                                        &Invalid);
+    const SrcMgr::SLocEntry &EndEntry =
+        SM.getSLocEntry(SM.getFileID(End), &Invalid);
     if (Invalid)
       return {};
 
@@ -1035,27 +1033,28 @@ CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
   return {};
 }
 
-StringRef Lexer::getSourceText(CharSourceRange Range,
-                               const SourceManager &SM,
-                               const LangOptions &LangOpts,
-                               bool *Invalid) {
+StringRef Lexer::getSourceText(CharSourceRange Range, const SourceManager &SM,
+                               const LangOptions &LangOpts, bool *Invalid) {
   Range = makeFileCharRange(Range, SM, LangOpts);
   if (Range.isInvalid()) {
-    if (Invalid) *Invalid = true;
+    if (Invalid)
+      *Invalid = true;
     return {};
   }
 
   // Break down the source location.
   FileIDAndOffset beginInfo = SM.getDecomposedLoc(Range.getBegin());
   if (beginInfo.first.isInvalid()) {
-    if (Invalid) *Invalid = true;
+    if (Invalid)
+      *Invalid = true;
     return {};
   }
 
   unsigned EndOffs;
   if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
       beginInfo.second > EndOffs) {
-    if (Invalid) *Invalid = true;
+    if (Invalid)
+      *Invalid = true;
     return {};
   }
 
@@ -1063,11 +1062,13 @@ StringRef Lexer::getSourceText(CharSourceRange Range,
   bool invalidTemp = false;
   StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
   if (invalidTemp) {
-    if (Invalid) *Invalid = true;
+    if (Invalid)
+      *Invalid = true;
     return {};
   }
 
-  if (Invalid) *Invalid = false;
+  if (Invalid)
+    *Invalid = false;
   return file.substr(beginInfo.second, EndOffs - beginInfo.second);
 }
 
@@ -1201,8 +1202,8 @@ StringRef Lexer::getIndentationForLine(SourceLocation Loc,
 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
     Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
 static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
-                                        SourceLocation FileLoc,
-                                        unsigned CharNo, unsigned TokLen) {
+                                        SourceLocation FileLoc, unsigned CharNo,
+                                        unsigned TokLen) {
   assert(FileLoc.isMacroID() && "Must be a macro expansion");
 
   // Otherwise, we're lexing "mapped tokens".  This is used for things like
@@ -1231,7 +1232,7 @@ SourceLocation Lexer::getSourceLocation(const char *Loc,
 
   // In the normal case, we're just lexing from a simple file buffer, return
   // the file id from FileLoc with the offset specified.
-  unsigned CharNo = Loc-BufferStart;
+  unsigned CharNo = Loc - BufferStart;
   if (FileLoc.isFileID())
     return FileLoc.getLocWithOffset(CharNo);
 
@@ -1255,16 +1256,26 @@ DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
 static char GetTrigraphCharForLetter(char Letter) {
   switch (Letter) {
-  default:   return 0;
-  case '=':  return '#';
-  case ')':  return ']';
-  case '(':  return '[';
-  case '!':  return '|';
-  case '\'': return '^';
-  case '>':  return '}';
-  case '/':  return '\\';
-  case '<':  return '{';
-  case '-':  return '~';
+  default:
+    return 0;
+  case '=':
+    return '#';
+  case ')':
+    return ']';
+  case '(':
+    return '[';
+  case '!':
+    return '|';
+  case '\'':
+    return '^';
+  case '>':
+    return '}';
+  case '/':
+    return '\\';
+  case '<':
+    return '{';
+  case '-':
+    return '~';
   }
 }
 
@@ -1279,12 +1290,12 @@ static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
 
   if (!Trigraphs) {
     if (L && !L->isLexingRawMode())
-      L->Diag(CP-2, diag::trigraph_ignored);
+      L->Diag(CP - 2, diag::trigraph_ignored);
     return 0;
   }
 
   if (L && !L->isLexingRawMode())
-    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
+    L->Diag(CP - 2, diag::trigraph_converted) << StringRef(&Res, 1);
   return Res;
 }
 
@@ -1296,12 +1307,11 @@ unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
   while (isWhitespace(Ptr[Size])) {
     ++Size;
 
-    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
+    if (Ptr[Size - 1] != '\n' && Ptr[Size - 1] != '\r')
       continue;
 
     // If this is a \r\n or \n\r, skip the other half.
-    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
-        Ptr[Size-1] != Ptr[Size])
+    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && Ptr[Size - 1] != Ptr[Size])
       ++Size;
 
     return Size;
@@ -1318,21 +1328,22 @@ const char *Lexer::SkipEscapedNewLines(const char *P) {
   while (true) {
     const char *AfterEscape;
     if (*P == '\\') {
-      AfterEscape = P+1;
+      AfterEscape = P + 1;
     } else if (*P == '?') {
       // If not a trigraph for escape, bail out.
       if (P[1] != '?' || P[2] != '/')
         return P;
       // FIXME: Take LangOpts into account; the language might not
       // support trigraphs.
-      AfterEscape = P+3;
+      AfterEscape = P + 3;
     } else {
       return P;
     }
 
     unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
-    if (NewLineSize == 0) return P;
-    P = AfterEscape+NewLineSize;
+    if (NewLineSize == 0)
+      return P;
+    P = AfterEscape + NewLineSize;
   }
 }
 
@@ -1359,7 +1370,7 @@ std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
 
   // Lex from the start of the given location.
   Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
-                                      TokenBegin, File.end());
+              TokenBegin, File.end());
   lexer.SetCommentRetentionState(IncludeComments);
   // Find the token.
   Token Tok;
@@ -1444,7 +1455,7 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
-Slash:
+  Slash:
     // Common case, backslash-char where the char is not whitespace.
     if (!isWhitespace(Ptr[0]))
       return {'\\', Size};
@@ -1453,7 +1464,8 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
     // newline.
     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
       // Remember that this token needs to be cleaned.
-      if (Tok) Tok->setFlag(Token::NeedsCleaning);
+      if (Tok)
+        Tok->setFlag(Token::NeedsCleaning);
 
       // Warn if there was whitespace between the backslash and newline.
       if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
@@ -1461,7 +1473,7 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
 
       // Found backslash<whitespace><newline>.  Parse the char after it.
       Size += EscapedNewLineSize;
-      Ptr  += EscapedNewLineSize;
+      Ptr += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
       auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
@@ -1480,11 +1492,13 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
     if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                     LangOpts.Trigraphs)) {
       // Remember that this token needs to be cleaned.
-      if (Tok) Tok->setFlag(Token::NeedsCleaning);
+      if (Tok)
+        Tok->setFlag(Token::NeedsCleaning);
 
       Ptr += 3;
       Size += 3;
-      if (C == '\\') goto Slash;
+      if (C == '\\')
+        goto Slash;
       return {C, Size};
     }
   }
@@ -1507,7 +1521,7 @@ Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
-Slash:
+  Slash:
     // Common case, backslash-char where the char is not whitespace.
     if (!isWhitespace(Ptr[0]))
       return {'\\', Size};
@@ -1516,7 +15...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/192492