[clang] 0529a34 - [clang][Preprocessor] Handle the first pp-token in EnterMainSourceFile (#145244)

via cfe-commits cfe-commits at lists.llvm.org
Wed Jun 25 17:49:46 PDT 2025


Author: yronglin
Date: 2025-06-26T08:49:43+08:00
New Revision: 0529a346007cecab95c6820a60cb3e4e36f34990

URL: https://github.com/llvm/llvm-project/commit/0529a346007cecab95c6820a60cb3e4e36f34990
DIFF: https://github.com/llvm/llvm-project/commit/0529a346007cecab95c6820a60cb3e4e36f34990.diff

LOG: [clang][Preprocessor] Handle the first pp-token in EnterMainSourceFile (#145244)

Depends on [[clang][Preprocessor] Add peekNextPPToken, makes look ahead
next token without
side-effects](https://github.com/llvm/llvm-project/pull/143898).

This PR fixes the performance regression that was introduced in
https://github.com/llvm/llvm-project/pull/144233.
The original PR (https://github.com/llvm/llvm-project/pull/144233) handled
the first pp-token in the main source file during macro
definition/expansion and in `Lexer::Lex`. Because the lexer is almost always
on the hot path, that approach can cause a performance regression. In this
PR, we instead handle the first pp-token in
`Preprocessor::EnterMainSourceFile`.

---------

Signed-off-by: yronglin <yronglin777 at gmail.com>

Added: 
    

Modified: 
    clang/include/clang/Lex/Preprocessor.h
    clang/include/clang/Lex/TokenLexer.h
    clang/lib/Lex/Lexer.cpp
    clang/lib/Lex/PPDirectives.cpp
    clang/lib/Lex/PPMacroExpansion.cpp
    clang/lib/Lex/Preprocessor.cpp
    clang/lib/Sema/SemaModule.cpp
    clang/unittests/Lex/LexerTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index dae12a6015439..4d82e20e5d4f3 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -350,8 +350,8 @@ class Preprocessor {
   /// Whether the last token we lexed was an '@'.
   bool LastTokenWasAt = false;
 
-  /// First pp-token in current translation unit.
-  std::optional<Token> FirstPPToken;
+  /// First pp-token source location in current translation unit.
+  SourceLocation FirstPPTokenLoc;
 
   /// A position within a C++20 import-seq.
   class StdCXXImportSeq {
@@ -1769,20 +1769,13 @@ class Preprocessor {
   std::optional<LexEmbedParametersResult> LexEmbedParameters(Token &Current,
                                                              bool ForHasEmbed);
 
-  /// Whether the preprocessor already seen the first pp-token in main file.
-  bool hasSeenMainFileFirstPPToken() const { return FirstPPToken.has_value(); }
-
-  /// Record first pp-token and check if it has a Token::FirstPPToken flag.
-  void HandleMainFileFirstPPToken(const Token &Tok) {
-    if (!hasSeenMainFileFirstPPToken() && Tok.isFirstPPToken() &&
-        SourceMgr.isWrittenInMainFile(Tok.getLocation()))
-      FirstPPToken = Tok;
+  /// Get the start location of the first pp-token in main file.
+  SourceLocation getMainFileFirstPPTokenLoc() const {
+    assert(FirstPPTokenLoc.isValid() &&
+           "Did not see the first pp-token in the main file");
+    return FirstPPTokenLoc;
   }
 
-  Token getMainFileFirstPPToken() const {
-    assert(FirstPPToken && "First main file pp-token doesn't exists");
-    return *FirstPPToken;
-  }
   bool LexAfterModuleImport(Token &Result);
   void CollectPpImportSuffix(SmallVectorImpl<Token> &Toks);
 

diff  --git a/clang/include/clang/Lex/TokenLexer.h b/clang/include/clang/Lex/TokenLexer.h
index 777b4e6266c71..7ac933d8f9d45 100644
--- a/clang/include/clang/Lex/TokenLexer.h
+++ b/clang/include/clang/Lex/TokenLexer.h
@@ -139,8 +139,9 @@ class TokenLexer {
   void Init(const Token *TokArray, unsigned NumToks, bool DisableMacroExpansion,
             bool OwnsTokens, bool IsReinject);
 
-  /// If the next token lexed will pop this macro off the expansion stack,
-  /// return std::nullopt, otherwise return the next unexpanded token.
+  /// If TokenLexer::isAtEnd returns true(the next token lexed will pop this
+  /// macro off the expansion stack), return std::nullopt, otherwise return the
+  /// next unexpanded token.
   std::optional<Token> peekNextPPToken() const;
 
   /// Lex and return a token from this macro stream.

diff  --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index f4d16ecce393c..42ea7edf3aaad 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -3228,6 +3228,7 @@ std::optional<Token> Lexer::peekNextPPToken() {
   bool atStartOfLine = IsAtStartOfLine;
   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
   bool leadingSpace = HasLeadingSpace;
+  bool isFirstPPToken = IsFirstPPToken;
 
   Token Tok;
   Lex(Tok);
@@ -3238,7 +3239,7 @@ std::optional<Token> Lexer::peekNextPPToken() {
   HasLeadingSpace = leadingSpace;
   IsAtStartOfLine = atStartOfLine;
   IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
-
+  IsFirstPPToken = isFirstPPToken;
   // Restore the lexer back to non-skipping mode.
   LexingRawMode = false;
 
@@ -3740,10 +3741,6 @@ bool Lexer::Lex(Token &Result) {
   bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
   // (After the LexTokenInternal call, the lexer might be destroyed.)
   assert((returnedToken || !isRawLex) && "Raw lex must succeed");
-
-  if (returnedToken && Result.isFirstPPToken() && PP &&
-      !PP->hasSeenMainFileFirstPPToken())
-    PP->HandleMainFileFirstPPToken(Result);
   return returnedToken;
 }
 
@@ -4547,8 +4544,6 @@ const char *Lexer::convertDependencyDirectiveToken(
   Result.setFlag((Token::TokenFlags)DDTok.Flags);
   Result.setLength(DDTok.Length);
   BufferPtr = TokPtr + DDTok.Length;
-  if (PP && !PP->hasSeenMainFileFirstPPToken() && Result.isFirstPPToken())
-    PP->HandleMainFileFirstPPToken(Result);
   return TokPtr;
 }
 

diff  --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index b88624b22e622..e6da19d24f1c5 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -1242,9 +1242,6 @@ void Preprocessor::HandleDirective(Token &Result) {
   // pp-directive.
   bool ReadAnyTokensBeforeDirective =CurPPLexer->MIOpt.getHasReadAnyTokensVal();
 
-  if (!hasSeenMainFileFirstPPToken())
-    HandleMainFileFirstPPToken(Result);
-
   // Save the '#' token in case we need to return it later.
   Token SavedHash = Result;
 

diff  --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 709cf3bb87c8e..b8b91e32179af 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -431,9 +431,6 @@ bool Preprocessor::HandleMacroExpandedIdentifier(Token &Identifier,
   // to disable the optimization in this case.
   if (CurPPLexer) CurPPLexer->MIOpt.ExpandedMacro();
 
-  if (!hasSeenMainFileFirstPPToken())
-    HandleMainFileFirstPPToken(Identifier);
-
   // If this is a builtin macro, like __LINE__ or _Pragma, handle it specially.
   if (MI->isBuiltinMacro()) {
     if (Callbacks)

diff  --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 500cf6f8400e0..aac3dc9fa54b0 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -566,6 +566,21 @@ void Preprocessor::EnterMainSourceFile() {
     // #imported, it won't be re-entered.
     if (OptionalFileEntryRef FE = SourceMgr.getFileEntryRefForID(MainFileID))
       markIncluded(*FE);
+
+    // Record the first PP token in the main file. This is used to generate
+    // better diagnostics for C++ modules.
+    //
+    // // This is a comment.
+    // #define FOO int  // note: add 'module;' to the start of the file
+    // ^ FirstPPToken   //       to introduce a global module fragment.
+    //
+    // export module M; // error: module declaration must occur
+    //                  //        at the start of the translation unit.
+    if (getLangOpts().CPlusPlusModules) {
+      std::optional<Token> FirstPPTok = CurLexer->peekNextPPToken();
+      if (FirstPPTok && FirstPPTok->isFirstPPToken())
+        FirstPPTokenLoc = FirstPPTok->getLocation();
+    }
   }
 
   // Preprocess Predefines to populate the initial preprocessor state.

diff  --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index fe70ce3fba6a5..7c982bcd63d73 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -337,11 +337,9 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
   // tokens in a file (excluding the global module fragment.).
   if (getLangOpts().CPlusPlusModules && !IntroducerIsFirstPPToken && !SeenGMF) {
     Diag(ModuleLoc, diag::err_module_decl_not_at_start);
-    SourceLocation BeginLoc = PP.getMainFileFirstPPToken().getLocation();
-    if (BeginLoc.isValid()) {
-      Diag(BeginLoc, diag::note_global_module_introducer_missing)
-          << FixItHint::CreateInsertion(BeginLoc, "module;\n");
-    }
+    SourceLocation BeginLoc = PP.getMainFileFirstPPTokenLoc();
+    Diag(BeginLoc, diag::note_global_module_introducer_missing)
+        << FixItHint::CreateInsertion(BeginLoc, "module;\n");
   }
 
   // C++23 [module.unit]p1: ... The identifiers module and import shall not

diff  --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index 33c8abbec35a3..2adb55484be88 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -49,8 +49,7 @@ class LexerTest : public ::testing::Test {
   }
 
   std::unique_ptr<Preprocessor> CreatePP(StringRef Source,
-                                         TrivialModuleLoader &ModLoader,
-                                         StringRef PreDefines = {}) {
+                                         TrivialModuleLoader &ModLoader) {
     std::unique_ptr<llvm::MemoryBuffer> Buf =
         llvm::MemoryBuffer::getMemBuffer(Source);
     SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
@@ -63,7 +62,7 @@ class LexerTest : public ::testing::Test {
         /*IILookup =*/nullptr,
         /*OwnsHeaderSearch =*/false);
     if (!PreDefines.empty())
-      PP->setPredefines(PreDefines.str());
+      PP->setPredefines(PreDefines);
     PP->Initialize(*Target);
     PP->EnterMainSourceFile();
     return PP;
@@ -111,6 +110,7 @@ class LexerTest : public ::testing::Test {
   std::shared_ptr<TargetOptions> TargetOpts;
   IntrusiveRefCntPtr<TargetInfo> Target;
   std::unique_ptr<Preprocessor> PP;
+  std::string PreDefines;
 };
 
 TEST_F(LexerTest, GetSourceTextExpandsToMaximumInMacroArgument) {
@@ -773,6 +773,7 @@ TEST(LexerPreambleTest, PreambleBounds) {
 }
 
 TEST_F(LexerTest, CheckFirstPPToken) {
+  LangOpts.CPlusPlusModules = true;
   {
     TrivialModuleLoader ModLoader;
     auto PP = CreatePP("// This is a comment\n"
@@ -781,9 +782,8 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     Token Tok;
     PP->Lex(Tok);
     EXPECT_TRUE(Tok.is(tok::kw_int));
-    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::kw_int));
+    EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc().isValid());
+    EXPECT_EQ(PP->getMainFileFirstPPTokenLoc(), Tok.getLocation());
   }
   {
     TrivialModuleLoader ModLoader;
@@ -794,24 +794,28 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     Token Tok;
     PP->Lex(Tok);
     EXPECT_TRUE(Tok.is(tok::kw_int));
-    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::hash));
+    EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
+                                    PP->getSourceManager(), PP->getLangOpts(),
+                                    /*IgnoreWhiteSpace=*/false));
+    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(Tok.is(tok::hash));
   }
 
   {
+    PreDefines = "#define FOO int\n";
     TrivialModuleLoader ModLoader;
     auto PP = CreatePP("// This is a comment\n"
                        "FOO a;",
-                       ModLoader, "#define FOO int\n");
+                       ModLoader);
     Token Tok;
     PP->Lex(Tok);
     EXPECT_TRUE(Tok.is(tok::kw_int));
-    EXPECT_TRUE(PP->hasSeenMainFileFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().isFirstPPToken());
-    EXPECT_TRUE(PP->getMainFileFirstPPToken().is(tok::identifier));
-    EXPECT_TRUE(
-        PP->getMainFileFirstPPToken().getIdentifierInfo()->isStr("FOO"));
+    EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
+                                    PP->getSourceManager(), PP->getLangOpts(),
+                                    /*IgnoreWhiteSpace=*/false));
+    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(Tok.is(tok::raw_identifier));
+    EXPECT_TRUE(Tok.getRawIdentifier() == "FOO");
   }
 }
 } // anonymous namespace


        


More information about the cfe-commits mailing list