[clang-tools-extra] [clang-tidy] [NFC] Move comment scanning to `LexerUtils` and add tests (PR #180371)

Thu Feb 12 13:01:28 PST 2026

https://github.com/unterumarmung updated https://github.com/llvm/llvm-project/pull/180371

>From 3d45791cd4724de984e74ab2d44fde2b6f0a293a Mon Sep 17 00:00:00 2001
From: Daniil Dudkin <unterumarmung at yandex.ru>
Date: Sun, 8 Feb 2026 00:15:19 +0300
Subject: [PATCH] [clang-tidy] [NFC] Move comment scanning to `LexerUtils` and
 add tests

---
 .../bugprone/ArgumentCommentCheck.cpp         |  74 ++-------
 .../clang-tidy/utils/LexerUtils.cpp           |  61 ++++++++
 .../clang-tidy/utils/LexerUtils.h             |  13 ++
 .../unittests/clang-tidy/CMakeLists.txt       |   1 +
 .../unittests/clang-tidy/LexerUtilsTest.cpp   | 144 ++++++++++++++++++
 5 files changed, 235 insertions(+), 58 deletions(-)
 create mode 100644 clang-tools-extra/unittests/clang-tidy/LexerUtilsTest.cpp

diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
index d46896808bd09..eb9b710b28549 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
@@ -17,6 +17,8 @@
 using namespace clang::ast_matchers;
 
 namespace clang::tidy::bugprone {
+
+using utils::lexer::CommentToken;
 namespace {
 AST_MATCHER(Decl, isFromStdNamespaceOrSystemHeader) {
   if (const auto *D = Node.getDeclContext()->getEnclosingNamespaceContext())
@@ -77,55 +79,9 @@ void ArgumentCommentCheck::registerMatchers(MatchFinder *Finder) {
                      this);
 }
 
-static std::vector<std::pair<SourceLocation, StringRef>>
-getCommentsInRange(ASTContext *Ctx, CharSourceRange Range) {
-  std::vector<std::pair<SourceLocation, StringRef>> Comments;
-  auto &SM = Ctx->getSourceManager();
-  const std::pair<FileID, unsigned> BeginLoc =
-                                        SM.getDecomposedLoc(Range.getBegin()),
-                                    EndLoc =
-                                        SM.getDecomposedLoc(Range.getEnd());
-
-  if (BeginLoc.first != EndLoc.first)
-    return Comments;
-
-  bool Invalid = false;
-  const StringRef Buffer = SM.getBufferData(BeginLoc.first, &Invalid);
-  if (Invalid)
-    return Comments;
-
-  const char *StrData = Buffer.data() + BeginLoc.second;
-
-  Lexer TheLexer(SM.getLocForStartOfFile(BeginLoc.first), Ctx->getLangOpts(),
-                 Buffer.begin(), StrData, Buffer.end());
-  TheLexer.SetCommentRetentionState(true);
-
-  while (true) {
-    Token Tok;
-    if (TheLexer.LexFromRawLexer(Tok))
-      break;
-    if (Tok.getLocation() == Range.getEnd() || Tok.is(tok::eof))
-      break;
-
-    if (Tok.is(tok::comment)) {
-      const std::pair<FileID, unsigned> CommentLoc =
-          SM.getDecomposedLoc(Tok.getLocation());
-      assert(CommentLoc.first == BeginLoc.first);
-      Comments.emplace_back(
-          Tok.getLocation(),
-          StringRef(Buffer.begin() + CommentLoc.second, Tok.getLength()));
-    } else {
-      // Clear comments found before the different token, e.g. comma.
-      Comments.clear();
-    }
-  }
-
-  return Comments;
-}
-
-static std::vector<std::pair<SourceLocation, StringRef>>
-getCommentsBeforeLoc(ASTContext *Ctx, SourceLocation Loc) {
-  std::vector<std::pair<SourceLocation, StringRef>> Comments;
+static std::vector<CommentToken> getCommentsBeforeLoc(ASTContext *Ctx,
+                                                      SourceLocation Loc) {
+  std::vector<CommentToken> Comments;
   while (Loc.isValid()) {
     const std::optional<Token> Tok = utils::lexer::getPreviousToken(
         Loc, Ctx->getSourceManager(), Ctx->getLangOpts(),
@@ -133,11 +89,12 @@ getCommentsBeforeLoc(ASTContext *Ctx, SourceLocation Loc) {
     if (!Tok || Tok->isNot(tok::comment))
       break;
     Loc = Tok->getLocation();
-    Comments.emplace_back(
+    Comments.emplace_back(CommentToken{
         Loc,
         Lexer::getSourceText(CharSourceRange::getCharRange(
                                  Loc, Loc.getLocWithOffset(Tok->getLength())),
-                             Ctx->getSourceManager(), Ctx->getLangOpts()));
+                             Ctx->getSourceManager(), Ctx->getLangOpts()),
+    });
   }
   return Comments;
 }
@@ -304,9 +261,10 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx,
         MakeFileCharRange(ArgBeginLoc, Args[I]->getBeginLoc());
     ArgBeginLoc = Args[I]->getEndLoc();
 
-    std::vector<std::pair<SourceLocation, StringRef>> Comments;
+    std::vector<CommentToken> Comments;
     if (BeforeArgument.isValid()) {
-      Comments = getCommentsInRange(Ctx, BeforeArgument);
+      Comments = utils::lexer::getTrailingCommentsInRange(
+          BeforeArgument, Ctx->getSourceManager(), Ctx->getLangOpts());
     } else {
       // Fall back to parsing back from the start of the argument.
       const CharSourceRange ArgsRange =
@@ -314,18 +272,18 @@ void ArgumentCommentCheck::checkCallArgs(ASTContext *Ctx,
       Comments = getCommentsBeforeLoc(Ctx, ArgsRange.getBegin());
     }
 
-    for (auto Comment : Comments) {
+    for (const auto &Comment : Comments) {
       llvm::SmallVector<StringRef, 2> Matches;
-      if (IdentRE.match(Comment.second, &Matches) &&
+      if (IdentRE.match(Comment.Text, &Matches) &&
           !sameName(Matches[2], II->getName(), StrictMode)) {
         {
           const DiagnosticBuilder Diag =
-              diag(Comment.first, "argument name '%0' in comment does not "
-                                  "match parameter name %1")
+              diag(Comment.Loc, "argument name '%0' in comment does not "
+                                "match parameter name %1")
               << Matches[2] << II;
           if (isLikelyTypo(Callee->parameters(), Matches[2], II->getName())) {
             Diag << FixItHint::CreateReplacement(
-                Comment.first, (Matches[1] + II->getName() + Matches[3]).str());
+                Comment.Loc, (Matches[1] + II->getName() + Matches[3]).str());
           }
         }
         diag(PVD->getLocation(), "%0 declared here", DiagnosticIDs::Note) << II;
diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp
index b77d985b76d77..b7de34f7e2e00 100644
--- a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp
@@ -10,6 +10,7 @@
 #include "clang/Basic/SourceManager.h"
 #include <optional>
 #include <utility>
+#include <vector>
 
 namespace clang::tidy::utils::lexer {
 
@@ -99,6 +100,66 @@ bool rangeContainsExpansionsOrDirectives(SourceRange Range,
   return false;
 }
 
+std::vector<CommentToken>
+getTrailingCommentsInRange(CharSourceRange Range, const SourceManager &SM,
+                           const LangOptions &LangOpts) {
+  std::vector<CommentToken> Comments;
+  if (Range.isInvalid())
+    return Comments;
+
+  const CharSourceRange FileRange =
+      Lexer::makeFileCharRange(Range, SM, LangOpts);
+  if (FileRange.isInvalid())
+    return Comments;
+
+  const std::pair<FileID, unsigned> BeginLoc =
+      SM.getDecomposedLoc(FileRange.getBegin());
+  const std::pair<FileID, unsigned> EndLoc =
+      SM.getDecomposedLoc(FileRange.getEnd());
+
+  if (BeginLoc.first != EndLoc.first)
+    return Comments;
+
+  bool Invalid = false;
+  const StringRef Buffer = SM.getBufferData(BeginLoc.first, &Invalid);
+  if (Invalid)
+    return Comments;
+
+  const char *StrData = Buffer.data() + BeginLoc.second;
+
+  Lexer TheLexer(SM.getLocForStartOfFile(BeginLoc.first), LangOpts,
+                 Buffer.begin(), StrData, Buffer.end());
+  // Use raw lexing with comment retention so we can see comment tokens without
+  // preprocessing or macro expansion effects.
+  TheLexer.SetCommentRetentionState(true);
+
+  while (true) {
+    Token Tok;
+    if (TheLexer.LexFromRawLexer(Tok))
+      break;
+    if (Tok.is(tok::eof) || Tok.getLocation() == FileRange.getEnd() ||
+        SM.isBeforeInTranslationUnit(FileRange.getEnd(), Tok.getLocation()))
+      break;
+
+    if (Tok.is(tok::comment)) {
+      const std::pair<FileID, unsigned> CommentLoc =
+          SM.getDecomposedLoc(Tok.getLocation());
+      assert(CommentLoc.first == BeginLoc.first);
+      Comments.emplace_back(CommentToken{
+          Tok.getLocation(),
+          StringRef(Buffer.begin() + CommentLoc.second, Tok.getLength()),
+      });
+    } else {
+      // Clear comments found before the different token, e.g. comma. Callers
+      // use this to retrieve only the contiguous comment block that directly
+      // precedes a token of interest.
+      Comments.clear();
+    }
+  }
+
+  return Comments;
+}
+
 std::optional<Token> getQualifyingToken(tok::TokenKind TK,
                                         CharSourceRange Range,
                                         const ASTContext &Context,
diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.h b/clang-tools-extra/clang-tidy/utils/LexerUtils.h
index 61389d86f22f4..38123ae14cff7 100644
--- a/clang-tools-extra/clang-tidy/utils/LexerUtils.h
+++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.h
@@ -14,6 +14,7 @@
 #include "clang/Lex/Lexer.h"
 #include <optional>
 #include <utility>
+#include <vector>
 
 namespace clang {
 
@@ -113,6 +114,18 @@ bool rangeContainsExpansionsOrDirectives(SourceRange Range,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts);
 
+// Represents a comment token and its source location in the original file.
+struct CommentToken {
+  SourceLocation Loc;
+  StringRef Text;
+};
+
+/// Returns comment tokens found in the given range. If a non-comment token is
+/// encountered, clears previously collected comments and continues.
+std::vector<CommentToken>
+getTrailingCommentsInRange(CharSourceRange Range, const SourceManager &SM,
+                           const LangOptions &LangOpts);
+
 /// Assuming that ``Range`` spans a CVR-qualified type, returns the
 /// token in ``Range`` that is responsible for the qualification. ``Range``
 /// must be valid with respect to ``SM``.  Returns ``std::nullopt`` if no
diff --git a/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt b/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt
index 64bf47e61736c..167f5d3def06b 100644
--- a/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-tidy/CMakeLists.txt
@@ -24,6 +24,7 @@ add_extra_unittest(ClangTidyTests
   DeclRefExprUtilsTest.cpp
   IncludeCleanerTest.cpp
   IncludeInserterTest.cpp
+  LexerUtilsTest.cpp
   GlobListTest.cpp
   GoogleModuleTest.cpp
   LLVMModuleTest.cpp
diff --git a/clang-tools-extra/unittests/clang-tidy/LexerUtilsTest.cpp b/clang-tools-extra/unittests/clang-tidy/LexerUtilsTest.cpp
new file mode 100644
index 0000000000000..438a78b4694ee
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-tidy/LexerUtilsTest.cpp
@@ -0,0 +1,144 @@
+//===--- LexerUtilsTest.cpp - clang-tidy ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../clang-tidy/utils/LexerUtils.h"
+
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Frontend/ASTUnit.h"
+#include "clang/Serialization/PCHContainerOperations.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Testing/Annotations/Annotations.h"
+#include "gtest/gtest.h"
+
+namespace clang::tidy::test {
+
+using clang::tooling::FileContentMappings;
+
+static std::unique_ptr<ASTUnit>
+buildAST(StringRef Code, const FileContentMappings &Mappings = {}) {
+  std::vector<std::string> Args = {"-std=c++20"};
+  return clang::tooling::buildASTFromCodeWithArgs(
+      Code, Args, "input.cc", "clang-tool",
+      std::make_shared<PCHContainerOperations>(),
+      clang::tooling::getClangStripDependencyFileAdjuster(), Mappings);
+}
+
+static CharSourceRange rangeFromAnnotations(const llvm::Annotations &A,
+                                            const SourceManager &SM, FileID FID,
+                                            llvm::StringRef Name = "") {
+  const auto R = A.range(Name);
+  const SourceLocation Begin =
+      SM.getLocForStartOfFile(FID).getLocWithOffset(R.Begin);
+  const SourceLocation End =
+      SM.getLocForStartOfFile(FID).getLocWithOffset(R.End);
+  return CharSourceRange::getCharRange(Begin, End);
+}
+
+namespace {
+
+TEST(LexerUtilsTest, GetTrailingCommentsInRangeAdjacentComments) {
+  llvm::Annotations Code(R"cpp(
+void f() {
+  $range[[/*first*/ /*second*/]]
+  int x = 0;
+}
+)cpp");
+  std::unique_ptr<ASTUnit> AST = buildAST(Code.code());
+  ASSERT_TRUE(AST);
+  const ASTContext &Context = AST->getASTContext();
+  const SourceManager &SM = Context.getSourceManager();
+  const LangOptions &LangOpts = Context.getLangOpts();
+
+  const CharSourceRange Range =
+      rangeFromAnnotations(Code, SM, SM.getMainFileID(), "range");
+  const std::vector<utils::lexer::CommentToken> Comments =
+      utils::lexer::getTrailingCommentsInRange(Range, SM, LangOpts);
+  ASSERT_EQ(2u, Comments.size());
+  EXPECT_EQ("/*first*/", Comments[0].Text);
+  EXPECT_EQ("/*second*/", Comments[1].Text);
+  const StringRef CodeText = Code.code();
+  const size_t FirstOffset = CodeText.find("/*first*/");
+  ASSERT_NE(StringRef::npos, FirstOffset);
+  const size_t SecondOffset = CodeText.find("/*second*/");
+  ASSERT_NE(StringRef::npos, SecondOffset);
+  EXPECT_EQ(FirstOffset, SM.getFileOffset(Comments[0].Loc));
+  EXPECT_EQ(SecondOffset, SM.getFileOffset(Comments[1].Loc));
+}
+
+TEST(LexerUtilsTest, GetTrailingCommentsInRangeClearsOnToken) {
+  llvm::Annotations Code(R"cpp(
+void f() {
+  int x = ($range[[/*first*/ 0, /*second*/]] 1);
+}
+)cpp");
+  std::unique_ptr<ASTUnit> AST = buildAST(Code.code());
+  ASSERT_TRUE(AST);
+  const ASTContext &Context = AST->getASTContext();
+  const SourceManager &SM = Context.getSourceManager();
+  const LangOptions &LangOpts = Context.getLangOpts();
+
+  const CharSourceRange Range =
+      rangeFromAnnotations(Code, SM, SM.getMainFileID(), "range");
+  const std::vector<utils::lexer::CommentToken> Comments =
+      utils::lexer::getTrailingCommentsInRange(Range, SM, LangOpts);
+  ASSERT_EQ(1u, Comments.size());
+  EXPECT_EQ("/*second*/", Comments.front().Text);
+  const StringRef CodeText = Code.code();
+  const size_t SecondOffset = CodeText.find("/*second*/");
+  ASSERT_NE(StringRef::npos, SecondOffset);
+  EXPECT_EQ(SecondOffset, SM.getFileOffset(Comments.front().Loc));
+}
+
+TEST(LexerUtilsTest, GetTrailingCommentsInRangeLineComments) {
+  llvm::Annotations Code(R"cpp(
+void f() {
+  $range[[// first
+  // second
+  ]]
+  int x = 0;
+}
+)cpp");
+  std::unique_ptr<ASTUnit> AST = buildAST(Code.code());
+  ASSERT_TRUE(AST);
+  const ASTContext &Context = AST->getASTContext();
+  const SourceManager &SM = Context.getSourceManager();
+  const LangOptions &LangOpts = Context.getLangOpts();
+
+  const CharSourceRange Range =
+      rangeFromAnnotations(Code, SM, SM.getMainFileID(), "range");
+  const std::vector<utils::lexer::CommentToken> Comments =
+      utils::lexer::getTrailingCommentsInRange(Range, SM, LangOpts);
+  ASSERT_EQ(2u, Comments.size());
+  EXPECT_EQ("// first", Comments[0].Text);
+  EXPECT_EQ("// second", Comments[1].Text);
+  const StringRef CodeText = Code.code();
+  const size_t FirstOffset = CodeText.find("// first");
+  ASSERT_NE(StringRef::npos, FirstOffset);
+  const size_t SecondOffset = CodeText.find("// second");
+  ASSERT_NE(StringRef::npos, SecondOffset);
+  EXPECT_EQ(FirstOffset, SM.getFileOffset(Comments[0].Loc));
+  EXPECT_EQ(SecondOffset, SM.getFileOffset(Comments[1].Loc));
+}
+
+TEST(LexerUtilsTest, GetTrailingCommentsInRangeInvalidRange) {
+  std::unique_ptr<ASTUnit> AST = buildAST("int value = 0;");
+  ASSERT_TRUE(AST);
+  const ASTContext &Context = AST->getASTContext();
+  const SourceManager &SM = Context.getSourceManager();
+  const LangOptions &LangOpts = Context.getLangOpts();
+
+  const std::vector<utils::lexer::CommentToken> Comments =
+      utils::lexer::getTrailingCommentsInRange(CharSourceRange(), SM, LangOpts);
+  EXPECT_TRUE(Comments.empty());
+}
+
+} // namespace
+
+} // namespace clang::tidy::test