[clang] [clang][Preprocessor] Handle the first pp-token in EnterMainSourceFile (PR #145244)

via cfe-commits cfe-commits at lists.llvm.org
Tue Jun 24 04:09:39 PDT 2025


https://github.com/yronglin updated https://github.com/llvm/llvm-project/pull/145244

>From 373daed324ca99f0b327c424c64a101f7d0f99a3 Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Thu, 12 Jun 2025 21:46:13 +0800
Subject: [PATCH 1/4] [Clang] Add peekNextPPToken, makes peek next token
 without side-effects

Signed-off-by: yronglin <yronglin777 at gmail.com>
---
 clang/include/clang/Lex/Lexer.h        | 10 ++++----
 clang/include/clang/Lex/Preprocessor.h |  8 ++++++-
 clang/include/clang/Lex/TokenLexer.h   |  7 +++---
 clang/lib/Lex/Lexer.cpp                | 21 +++++++++--------
 clang/lib/Lex/PPMacroExpansion.cpp     | 32 ++++++++++++--------------
 clang/lib/Lex/TokenLexer.cpp           | 10 ++++----
 6 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index bb65ae010cffa..a595cda1eaa77 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -124,7 +124,7 @@ class Lexer : public PreprocessorLexer {
   //===--------------------------------------------------------------------===//
   // Context that changes as the file is lexed.
   // NOTE: any state that mutates when in raw mode must have save/restore code
-  // in Lexer::isNextPPTokenLParen.
+  // in Lexer::peekNextPPToken.
 
   // BufferPtr - Current pointer into the buffer.  This is the next character
   // to be lexed.
@@ -642,10 +642,10 @@ class Lexer : public PreprocessorLexer {
     BufferPtr = TokEnd;
   }
 
-  /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
-  /// tok::l_paren token, 0 if it is something else and 2 if there are no more
-  /// tokens in the buffer controlled by this lexer.
-  unsigned isNextPPTokenLParen();
+  /// peekNextPPToken - Return std::nullopt if there are no more tokens in the
+  /// buffer controlled by this lexer, otherwise return the next unexpanded
+  /// token.
+  std::optional<Token> peekNextPPToken();
 
   //===--------------------------------------------------------------------===//
   // Lexer character reading interfaces.
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 78be2bd64d61c..4fe7a79393afc 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -2288,7 +2288,9 @@ class Preprocessor {
   /// Determine whether the next preprocessor token to be
   /// lexed is a '('.  If so, consume the token and return true, if not, this
   /// method should have no observable side-effect on the lexed tokens.
-  bool isNextPPTokenLParen();
+  bool isNextPPTokenLParen() {
+    return peekNextPPToken().value_or(Token{}).is(tok::l_paren);
+  }
 
 private:
   /// Identifiers used for SEH handling in Borland. These are only
@@ -2667,6 +2669,10 @@ class Preprocessor {
 
   void removeCachedMacroExpandedTokensOfLastLexer();
 
+  /// Peek the next token. If so, return the token, if not, this
+  /// method should have no observable side-effect on the lexed tokens.
+  std::optional<Token> peekNextPPToken();
+
   /// After reading "MACRO(", this method is invoked to read all of the formal
   /// arguments specified for the macro invocation.  Returns null on error.
   MacroArgs *ReadMacroCallArgumentList(Token &MacroName, MacroInfo *MI,
diff --git a/clang/include/clang/Lex/TokenLexer.h b/clang/include/clang/Lex/TokenLexer.h
index 4d229ae610674..777b4e6266c71 100644
--- a/clang/include/clang/Lex/TokenLexer.h
+++ b/clang/include/clang/Lex/TokenLexer.h
@@ -139,10 +139,9 @@ class TokenLexer {
   void Init(const Token *TokArray, unsigned NumToks, bool DisableMacroExpansion,
             bool OwnsTokens, bool IsReinject);
 
-  /// If the next token lexed will pop this macro off the
-  /// expansion stack, return 2.  If the next unexpanded token is a '(', return
-  /// 1, otherwise return 0.
-  unsigned isNextTokenLParen() const;
+  /// If the next token lexed will pop this macro off the expansion stack,
+  /// return std::nullopt, otherwise return the next unexpanded token.
+  std::optional<Token> peekNextPPToken() const;
 
   /// Lex and return a token from this macro stream.
   bool Lex(Token &Tok);
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 93200458f04b4..8e977eac8c983 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -3200,18 +3200,19 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
   return PP->HandleEndOfFile(Result, isPragmaLexer());
 }
 
-/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
-/// the specified lexer will return a tok::l_paren token, 0 if it is something
-/// else and 2 if there are no more tokens in the buffer controlled by the
-/// lexer.
-unsigned Lexer::isNextPPTokenLParen() {
+/// peekNextPPToken - Return std::nullopt if there are no more tokens in the
+/// buffer controlled by this lexer, otherwise return the next unexpanded
+/// token.
+std::optional<Token> Lexer::peekNextPPToken() {
   assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
 
   if (isDependencyDirectivesLexer()) {
     if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
-      return 2;
-    return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
-        tok::l_paren);
+      return std::nullopt;
+    Token Result;
+    (void)convertDependencyDirectiveToken(
+        DepDirectives.front().Tokens[NextDepDirectiveTokenIndex], Result);
+    return Result;
   }
 
   // Switch to 'skipping' mode.  This will ensure that we can lex a token
@@ -3240,8 +3241,8 @@ unsigned Lexer::isNextPPTokenLParen() {
   LexingRawMode = false;
 
   if (Tok.is(tok::eof))
-    return 2;
-  return Tok.is(tok::l_paren);
+    return std::nullopt;
+  return Tok;
 }
 
 /// Find the end of a version control conflict marker.
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 37ac1bf07e9c0..585990f60c98a 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -418,42 +418,40 @@ static bool isTrivialSingleTokenExpansion(const MacroInfo *MI,
   return !llvm::is_contained(MI->params(), II);
 }
 
-/// isNextPPTokenLParen - Determine whether the next preprocessor token to be
-/// lexed is a '('.  If so, consume the token and return true, if not, this
-/// method should have no observable side-effect on the lexed tokens.
-bool Preprocessor::isNextPPTokenLParen() {
+/// isNextPPTokenLParen - Peek the next token. If so, return the token, if not,
+/// this method should have no observable side-effect on the lexed tokens.
+std::optional<Token> Preprocessor::peekNextPPToken() {
   // Do some quick tests for rejection cases.
-  unsigned Val;
+  std::optional<Token> Val;
   if (CurLexer)
-    Val = CurLexer->isNextPPTokenLParen();
+    Val = CurLexer->peekNextPPToken();
   else
-    Val = CurTokenLexer->isNextTokenLParen();
+    Val = CurTokenLexer->peekNextPPToken();
 
-  if (Val == 2) {
+  if (!Val) {
     // We have run off the end.  If it's a source file we don't
     // examine enclosing ones (C99 5.1.1.2p4).  Otherwise walk up the
     // macro stack.
     if (CurPPLexer)
-      return false;
+      return std::nullopt;
     for (const IncludeStackInfo &Entry : llvm::reverse(IncludeMacroStack)) {
       if (Entry.TheLexer)
-        Val = Entry.TheLexer->isNextPPTokenLParen();
+        Val = Entry.TheLexer->peekNextPPToken();
       else
-        Val = Entry.TheTokenLexer->isNextTokenLParen();
+        Val = Entry.TheTokenLexer->peekNextPPToken();
 
-      if (Val != 2)
+      if (Val)
         break;
 
       // Ran off the end of a source file?
       if (Entry.ThePPLexer)
-        return false;
+        return std::nullopt;
     }
   }
 
-  // Okay, if we know that the token is a '(', lex it and return.  Otherwise we
-  // have found something that isn't a '(' or we found the end of the
-  // translation unit.  In either case, return false.
-  return Val == 1;
+  // Okay, we found the token and return.  Otherwise we found the end of the
+  // translation unit.
+  return Val;
 }
 
 /// HandleMacroExpandedIdentifier - If an identifier token is read that is to be
diff --git a/clang/lib/Lex/TokenLexer.cpp b/clang/lib/Lex/TokenLexer.cpp
index 6e93416e01c0c..fbb8c4262d6da 100644
--- a/clang/lib/Lex/TokenLexer.cpp
+++ b/clang/lib/Lex/TokenLexer.cpp
@@ -921,13 +921,13 @@ bool TokenLexer::pasteTokens(Token &LHSTok, ArrayRef<Token> TokenStream,
 }
 
 /// isNextTokenLParen - If the next token lexed will pop this macro off the
-/// expansion stack, return 2.  If the next unexpanded token is a '(', return
-/// 1, otherwise return 0.
-unsigned TokenLexer::isNextTokenLParen() const {
+/// expansion stack, return std::nullopt, otherwise return the next unexpanded
+/// token.
+std::optional<Token> TokenLexer::peekNextPPToken() const {
   // Out of tokens?
   if (isAtEnd())
-    return 2;
-  return Tokens[CurTokenIdx].is(tok::l_paren);
+    return std::nullopt;
+  return Tokens[CurTokenIdx];
 }
 
 /// isParsingPreprocessorDirective - Return true if we are in the middle of a

>From c963c3a9cdb7835e5c031bf7a1f82e61f0c15e42 Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Sat, 14 Jun 2025 14:29:16 +0800
Subject: [PATCH 2/4] Update peekNextToken to isNextPPTokenOneOf

Signed-off-by: yronglin <yronglin777 at gmail.com>
---
 clang/include/clang/Lex/Preprocessor.h | 41 +++++++++++++++++++++-----
 clang/lib/Lex/PPDirectives.cpp         |  4 +--
 clang/lib/Lex/PPMacroExpansion.cpp     | 36 ----------------------
 clang/lib/Lex/Preprocessor.cpp         |  4 +--
 4 files changed, 37 insertions(+), 48 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4fe7a79393afc..f5e85b3da1ba1 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -2285,11 +2285,40 @@ class Preprocessor {
     }
   }
 
-  /// Determine whether the next preprocessor token to be
-  /// lexed is a '('.  If so, consume the token and return true, if not, this
+  /// Check whether the next pp-token is one of the specificed token kind. this
   /// method should have no observable side-effect on the lexed tokens.
-  bool isNextPPTokenLParen() {
-    return peekNextPPToken().value_or(Token{}).is(tok::l_paren);
+  template <tok::TokenKind K, tok::TokenKind... Ks> bool isNextPPTokenOneOf() {
+    // Do some quick tests for rejection cases.
+    std::optional<Token> Val;
+    if (CurLexer)
+      Val = CurLexer->peekNextPPToken();
+    else
+      Val = CurTokenLexer->peekNextPPToken();
+
+    if (!Val) {
+      // We have run off the end.  If it's a source file we don't
+      // examine enclosing ones (C99 5.1.1.2p4).  Otherwise walk up the
+      // macro stack.
+      if (CurPPLexer)
+        return false;
+      for (const IncludeStackInfo &Entry : llvm::reverse(IncludeMacroStack)) {
+        if (Entry.TheLexer)
+          Val = Entry.TheLexer->peekNextPPToken();
+        else
+          Val = Entry.TheTokenLexer->peekNextPPToken();
+
+        if (Val)
+          break;
+
+        // Ran off the end of a source file?
+        if (Entry.ThePPLexer)
+          return false;
+      }
+    }
+
+    // Okay, we found the token and return.  Otherwise we found the end of the
+    // translation unit.
+    return Val->is(K) || (... || Val->is(Ks));
   }
 
 private:
@@ -2669,10 +2698,6 @@ class Preprocessor {
 
   void removeCachedMacroExpandedTokensOfLastLexer();
 
-  /// Peek the next token. If so, return the token, if not, this
-  /// method should have no observable side-effect on the lexed tokens.
-  std::optional<Token> peekNextPPToken();
-
   /// After reading "MACRO(", this method is invoked to read all of the formal
   /// arguments specified for the macro invocation.  Returns null on error.
   MacroArgs *ReadMacroCallArgumentList(Token &MacroName, MacroInfo *MI,
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index 04a30f66fb736..be061f462f65a 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -183,9 +183,9 @@ static bool isReservedCXXAttributeName(Preprocessor &PP, IdentifierInfo *II) {
     AttributeCommonInfo::AttrArgsInfo AttrArgsInfo =
         AttributeCommonInfo::getCXX11AttrArgsInfo(II);
     if (AttrArgsInfo == AttributeCommonInfo::AttrArgsInfo::Required)
-      return PP.isNextPPTokenLParen();
+      return PP.isNextPPTokenOneOf<tok::l_paren>();
 
-    return !PP.isNextPPTokenLParen() ||
+    return !PP.isNextPPTokenOneOf<tok::l_paren>() ||
            AttrArgsInfo == AttributeCommonInfo::AttrArgsInfo::Optional;
   }
   return false;
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 585990f60c98a..b8b91e32179af 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -418,42 +418,6 @@ static bool isTrivialSingleTokenExpansion(const MacroInfo *MI,
   return !llvm::is_contained(MI->params(), II);
 }
 
-/// isNextPPTokenLParen - Peek the next token. If so, return the token, if not,
-/// this method should have no observable side-effect on the lexed tokens.
-std::optional<Token> Preprocessor::peekNextPPToken() {
-  // Do some quick tests for rejection cases.
-  std::optional<Token> Val;
-  if (CurLexer)
-    Val = CurLexer->peekNextPPToken();
-  else
-    Val = CurTokenLexer->peekNextPPToken();
-
-  if (!Val) {
-    // We have run off the end.  If it's a source file we don't
-    // examine enclosing ones (C99 5.1.1.2p4).  Otherwise walk up the
-    // macro stack.
-    if (CurPPLexer)
-      return std::nullopt;
-    for (const IncludeStackInfo &Entry : llvm::reverse(IncludeMacroStack)) {
-      if (Entry.TheLexer)
-        Val = Entry.TheLexer->peekNextPPToken();
-      else
-        Val = Entry.TheTokenLexer->peekNextPPToken();
-
-      if (Val)
-        break;
-
-      // Ran off the end of a source file?
-      if (Entry.ThePPLexer)
-        return std::nullopt;
-    }
-  }
-
-  // Okay, we found the token and return.  Otherwise we found the end of the
-  // translation unit.
-  return Val;
-}
-
 /// HandleMacroExpandedIdentifier - If an identifier token is read that is to be
 /// expanded as a macro, handle it and return the next token as 'Identifier'.
 bool Preprocessor::HandleMacroExpandedIdentifier(Token &Identifier,
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 21fc7a2b6fae2..f28113e569218 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -811,14 +811,14 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) {
       if (!Identifier.isExpandDisabled() && MI->isEnabled()) {
         // C99 6.10.3p10: If the preprocessing token immediately after the
         // macro name isn't a '(', this macro should not be expanded.
-        if (!MI->isFunctionLike() || isNextPPTokenLParen())
+        if (!MI->isFunctionLike() || isNextPPTokenOneOf<tok::l_paren>())
           return HandleMacroExpandedIdentifier(Identifier, MD);
       } else {
         // C99 6.10.3.4p2 says that a disabled macro may never again be
         // expanded, even if it's in a context where it could be expanded in the
         // future.
         Identifier.setFlag(Token::DisableExpand);
-        if (MI->isObjectLike() || isNextPPTokenLParen())
+        if (MI->isObjectLike() || isNextPPTokenOneOf<tok::l_paren>())
           Diag(Identifier, diag::pp_disabled_macro_expansion);
       }
     }

>From 6fe2cd4c723695c0ce6a4d2ced7a2b2628cdaf5d Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Mon, 23 Jun 2025 01:30:20 +0800
Subject: [PATCH 3/4] [clang][Preprocessor] Handle the first pp-token in
 EnterMainSourceFile

Signed-off-by: yronglin <yronglin777 at gmail.com>
---
 clang/include/clang/Lex/Preprocessor.h | 19 +++++---------
 clang/lib/Lex/Lexer.cpp                |  9 ++-----
 clang/lib/Lex/PPDirectives.cpp         |  3 ---
 clang/lib/Lex/PPMacroExpansion.cpp     |  3 ---
 clang/lib/Lex/Preprocessor.cpp         | 15 ++++++++++++
 clang/lib/Sema/SemaModule.cpp          |  8 +++---
 clang/unittests/Lex/LexerTest.cpp      | 34 ++++++++++++++------------
 7 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 0ec1cb4d0c5d8..a3cf18f21ca40 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -351,7 +351,7 @@ class Preprocessor {
   bool LastTokenWasAt = false;
 
   /// First pp-token in current translation unit.
-  std::optional<Token> FirstPPToken;
+  SourceLocation FirstPPTokenLoc;
 
   /// A position within a C++20 import-seq.
   class StdCXXImportSeq {
@@ -1769,20 +1769,13 @@ class Preprocessor {
   std::optional<LexEmbedParametersResult> LexEmbedParameters(Token &Current,
                                                              bool ForHasEmbed);
 
-  /// Whether the preprocessor already seen the first pp-token in main file.
-  bool hasSeenMainFileFirstPPToken() const { return FirstPPToken.has_value(); }
-
-  /// Record first pp-token and check if it has a Token::FirstPPToken flag.
-  void HandleMainFileFirstPPToken(const Token &Tok) {
-    if (!hasSeenMainFileFirstPPToken() && Tok.isFirstPPToken() &&
-        SourceMgr.isWrittenInMainFile(Tok.getLocation()))
-      FirstPPToken = Tok;
+  /// Get the start location of the first pp-token in main file.
+  SourceLocation getMainFileFirstPPTokenLoc() const {
+    assert(FirstPPTokenLoc.isValid() &&
+           "Did not see the first pp-token in the main file");
+    return FirstPPTokenLoc;
   }
 
-  Token getMainFileFirstPPToken() const {
-    assert(FirstPPToken && "First main file pp-token doesn't exists");
-    return *FirstPPToken;
-  }
   bool LexAfterModuleImport(Token &Result);
   void CollectPpImportSuffix(SmallVectorImpl<Token> &Toks);
 
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index f4d16ecce393c..42ea7edf3aaad 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -3228,6 +3228,7 @@ std::optional<Token> Lexer::peekNextPPToken() {
   bool atStartOfLine = IsAtStartOfLine;
   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
   bool leadingSpace = HasLeadingSpace;
+  bool isFirstPPToken = IsFirstPPToken;
 
   Token Tok;
   Lex(Tok);
@@ -3238,7 +3239,7 @@ std::optional<Token> Lexer::peekNextPPToken() {
   HasLeadingSpace = leadingSpace;
   IsAtStartOfLine = atStartOfLine;
   IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
-
+  IsFirstPPToken = isFirstPPToken;
   // Restore the lexer back to non-skipping mode.
   LexingRawMode = false;
 
@@ -3740,10 +3741,6 @@ bool Lexer::Lex(Token &Result) {
   bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
   // (After the LexTokenInternal call, the lexer might be destroyed.)
   assert((returnedToken || !isRawLex) && "Raw lex must succeed");
-
-  if (returnedToken && Result.isFirstPPToken() && PP &&
-      !PP->hasSeenMainFileFirstPPToken())
-    PP->HandleMainFileFirstPPToken(Result);
   return returnedToken;
 }
 
@@ -4547,8 +4544,6 @@ const char *Lexer::convertDependencyDirectiveToken(
   Result.setFlag((Token::TokenFlags)DDTok.Flags);
   Result.setLength(DDTok.Length);
   BufferPtr = TokPtr + DDTok.Length;
-  if (PP && !PP->hasSeenMainFileFirstPPToken() && Result.isFirstPPToken())
-    PP->HandleMainFileFirstPPToken(Result);
   return TokPtr;
 }
 
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index c8974e5a3528c..be061f462f65a 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1242,9 +1242,6 @@ void Preprocessor::HandleDirective(Token &Result) {
   // pp-directive.
   bool ReadAnyTokensBeforeDirective =CurPPLexer->MIOpt.getHasReadAnyTokensVal();
 
-  if (!hasSeenMainFileFirstPPToken())
-    HandleMainFileFirstPPToken(Result);
-
   // Save the '#' token in case we need to return it later.
   Token SavedHash = Result;
 
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 709cf3bb87c8e..b8b91e32179af 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -431,9 +431,6 @@ bool Preprocessor::HandleMacroExpandedIdentifier(Token &Identifier,
   // to disable the optimization in this case.
   if (CurPPLexer) CurPPLexer->MIOpt.ExpandedMacro();
 
-  if (!hasSeenMainFileFirstPPToken())
-    HandleMainFileFirstPPToken(Identifier);
-
   // If this is a builtin macro, like __LINE__ or _Pragma, handle it specially.
   if (MI->isBuiltinMacro()) {
     if (Callbacks)
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 7fecbe9eee53c..9329f9fd4460a 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -566,6 +566,21 @@ void Preprocessor::EnterMainSourceFile() {
     // #imported, it won't be re-entered.
     if (OptionalFileEntryRef FE = SourceMgr.getFileEntryRefForID(MainFileID))
       markIncluded(*FE);
+
+    // Record the first PP token in the main file. This is used to generate
+    // better diagnostics for C++ modules.
+    //
+    // // This is a comment.
+    // #define FOO int  // note: add 'module;' to the start of the file
+    // ^ FirstPPToken   //       to introduce a global module fragment.
+    //
+    // export module M; // error: module declaration must occur
+    //                  //        at the start of the translation unit.
+    if (getLangOpts().CPlusPlusModules) {
+      std::optional<Token> FirstPPTok = CurLexer->peekNextPPToken();
+      if (FirstPPTok && FirstPPTok->isFirstPPToken())
+        FirstPPTokenLoc = FirstPPTok->getLocation();
+    }
   }
 
   // Preprocess Predefines to populate the initial preprocessor state.
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index fe70ce3fba6a5..7c982bcd63d73 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -337,11 +337,9 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
   // tokens in a file (excluding the global module fragment.).
   if (getLangOpts().CPlusPlusModules && !IntroducerIsFirstPPToken && !SeenGMF) {
     Diag(ModuleLoc, diag::err_module_decl_not_at_start);
-    SourceLocation BeginLoc = PP.getMainFileFirstPPToken().getLocation();
-    if (BeginLoc.isValid()) {
-      Diag(BeginLoc, diag::note_global_module_introducer_missing)
-          << FixItHint::CreateInsertion(BeginLoc, "module;\n");
-    }
+    SourceLocation BeginLoc = PP.getMainFileFirstPPTokenLoc();
+    Diag(BeginLoc, diag::note_global_module_introducer_missing)
+        << FixItHint::CreateInsertion(BeginLoc, "module;\n");
   }
 
   // C++23 [module.unit]p1: ... The identifiers module and import shall not
diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index 33c8abbec35a3..2adb55484be88 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -49,8 +49,7 @@ class LexerTest : public ::testing::Test {
   }
 
   std::unique_ptr<Preprocessor> CreatePP(StringRef Source,
-                                         TrivialModuleLoader &ModLoader,
-                                         StringRef PreDefines = {}) {
+                                         TrivialModuleLoader &ModLoader) {
     std::unique_ptr<llvm::MemoryBuffer> Buf =
         llvm::MemoryBuffer::getMemBuffer(Source);
     SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
@@ -63,7 +62,7 @@ class LexerTest : public ::testing::Test {
         /*IILookup =*/nullptr,
         /*OwnsHeaderSearch =*/false);
     if (!PreDefines.empty())
-      PP->setPredefines(PreDefines.str());
+      PP->setPredefines(PreDefines);
     PP->Initialize(*Target);
     PP->EnterMainSourceFile();
     return PP;
@@ -111,6 +110,7 @@ class LexerTest : public ::testing::Test {
   std::shared_ptr<TargetOptions> TargetOpts;
   IntrusiveRefCntPtr<TargetInfo> Target;
   std::unique_ptr<Preprocessor> PP;
+  std::string PreDefines;
 };
 
 TEST_F(LexerTest, GetSourceTextExpandsToMaximumInMacroArgument) {
@@ -773,6 +773,7 @@ TEST(LexerPreambleTest, PreambleBounds) {
 }
 
 TEST_F(LexerTest, CheckFirstPPToken) {
+  LangOpts.CPlusPlusModules = true;
   {
     TrivialModuleLoader ModLoader;
     auto PP = CreatePP("// This is a comment\n"
@@ -781,9 +782,8 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     Token Tok;
     PP->Lex(Tok);
     EXPECT_TRUE(Tok.is(tok::kw_int));
-    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::kw_int));
+    EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc().isValid());
+    EXPECT_EQ(PP->getMainFileFirstPPTokenLoc(), Tok.getLocation());
   }
   {
     TrivialModuleLoader ModLoader;
@@ -794,24 +794,28 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     Token Tok;
     PP->Lex(Tok);
     EXPECT_TRUE(Tok.is(tok::kw_int));
-    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::hash));
+    EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
+                                    PP->getSourceManager(), PP->getLangOpts(),
+                                    /*IgnoreWhiteSpace=*/false));
+    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(Tok.is(tok::hash));
   }
 
   {
+    PreDefines = "#define FOO int\n";
     TrivialModuleLoader ModLoader;
     auto PP = CreatePP("// This is a comment\n"
                        "FOO a;",
-                       ModLoader, "#define FOO int\n");
+                       ModLoader);
     Token Tok;
     PP->Lex(Tok);
     EXPECT_TRUE(Tok.is(tok::kw_int));
-    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::identifier));
-    EXPECT_TRUE(
-        PP->getMainFileFirstPPToken().getIdentifierInfo()->isStr("FOO"));
+    EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
+                                    PP->getSourceManager(), PP->getLangOpts(),
+                                    /*IgnoreWhiteSpace=*/false));
+    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(Tok.is(tok::raw_identifier));
+    EXPECT_TRUE(Tok.getRawIdentifier() == "FOO");
   }
 }
 } // anonymous namespace

>From b3b3219b3d2e38828ba5f79acc81eb91b7359cb1 Mon Sep 17 00:00:00 2001
From: yronglin <yronglin777 at gmail.com>
Date: Tue, 24 Jun 2025 19:09:20 +0800
Subject: [PATCH 4/4] [NFC] Update comments in code

Signed-off-by: yronglin <yronglin777 at gmail.com>
---
 clang/include/clang/Lex/Preprocessor.h | 2 +-
 clang/include/clang/Lex/TokenLexer.h   | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index a3cf18f21ca40..55661e8d4256f 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -350,7 +350,7 @@ class Preprocessor {
   /// Whether the last token we lexed was an '@'.
   bool LastTokenWasAt = false;
 
-  /// First pp-token in current translation unit.
+  /// First pp-token source location in current translation unit.
   SourceLocation FirstPPTokenLoc;
 
   /// A position within a C++20 import-seq.
diff --git a/clang/include/clang/Lex/TokenLexer.h b/clang/include/clang/Lex/TokenLexer.h
index 777b4e6266c71..620d3d620d7a1 100644
--- a/clang/include/clang/Lex/TokenLexer.h
+++ b/clang/include/clang/Lex/TokenLexer.h
@@ -139,8 +139,9 @@ class TokenLexer {
   void Init(const Token *TokArray, unsigned NumToks, bool DisableMacroExpansion,
             bool OwnsTokens, bool IsReinject);
 
-  /// If the next token lexed will pop this macro off the expansion stack,
-  /// return std::nullopt, otherwise return the next unexpanded token.
+  /// If TokenLexer::isAtEnd returns true (the next token lexed will pop this
+  /// macro off the expansion stack, return std::nullopt, otherwise return the
+  /// next unexpanded token.
   std::optional<Token> peekNextPPToken() const;
 
   /// Lex and return a token from this macro stream.



More information about the cfe-commits mailing list