[clang-tools-extra] [clang-tidy] Add ranges-style view for tokenizing source code (PR #172508)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Dec 16 08:01:38 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-tools-extra
Author: Victor Chernyakin (localspook)
<details>
<summary>Changes</summary>
We have several checks that want to relex source code, but right now, doing so is annoying; the `Lexer` API is difficult to use. This PR introduces a ranges-style wrapper for it and converts some checks to use the new API.
---
Full diff: https://github.com/llvm/llvm-project/pull/172508.diff
7 Files Affected:
- (modified) clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp (+2-13)
- (modified) clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp (+13-22)
- (modified) clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp (+4-7)
- (modified) clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp (+2-11)
- (modified) clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp (+22-33)
- (modified) clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp (+7-16)
- (modified) clang-tools-extra/clang-tidy/utils/LexerUtils.h (+74)
``````````diff
diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
index ed30d01e645d1..235c68eea08b4 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
@@ -94,19 +94,8 @@ getCommentsInRange(ASTContext *Ctx, CharSourceRange Range) {
if (Invalid)
return Comments;
- const char *StrData = Buffer.data() + BeginLoc.second;
-
- Lexer TheLexer(SM.getLocForStartOfFile(BeginLoc.first), Ctx->getLangOpts(),
- Buffer.begin(), StrData, Buffer.end());
- TheLexer.SetCommentRetentionState(true);
-
- while (true) {
- Token Tok;
- if (TheLexer.LexFromRawLexer(Tok))
- break;
- if (Tok.getLocation() == Range.getEnd() || Tok.is(tok::eof))
- break;
-
+ for (const Token Tok :
+ utils::lexer::tokensIncludingComments(Range, SM, Ctx->getLangOpts())) {
if (Tok.is(tok::comment)) {
const std::pair<FileID, unsigned> CommentLoc =
SM.getDecomposedLoc(Tok.getLocation());
diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
index 098d46cae5df4..8f88daf1ea7cc 100644
--- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "MacroToEnumCheck.h"
+#include "../utils/LexerUtils.h"
#include "IntegralLiteralExpressionMatcher.h"
-
#include "clang/AST/ASTContext.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/Lex/Preprocessor.h"
@@ -19,17 +19,14 @@
namespace clang::tidy::modernize {
-static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options,
- StringRef Text) {
+static bool hasOnlyComments(SourceLocation Loc, const SourceManager &SM,
+ const LangOptions &Options,
+ CharSourceRange CharRange) {
// Use a lexer to look for tokens; if we find something other than a single
// hash, then there were intervening tokens between macro definitions.
- const std::string Buffer{Text};
- Lexer Lex(Loc, Options, Buffer.c_str(), Buffer.c_str(),
- Buffer.c_str() + Buffer.size());
- Token Tok;
bool SeenHash = false;
- while (!Lex.LexFromRawLexer(Tok)) {
- if (Tok.getKind() == tok::hash && !SeenHash) {
+ for (const Token Tok : utils::lexer::tokens(CharRange, SM, Options)) {
+ if (Tok.is(tok::hash) && !SeenHash) {
SeenHash = true;
continue;
}
@@ -46,6 +43,7 @@ static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options,
CRLFCR,
};
+ const StringRef Text = Lexer::getSourceText(CharRange, SM, Options);
WhiteSpace State = WhiteSpace::Nothing;
for (const char C : Text) {
switch (C) {
@@ -237,8 +235,7 @@ bool MacroToEnumCallbacks::isConsecutiveMacro(const MacroDirective *MD) const {
SourceRange{CurrentFile->LastMacroLocation, Define}, true};
const CharSourceRange CharRange =
Lexer::makeFileCharRange(BetweenMacros, SM, LangOpts);
- const StringRef BetweenText = Lexer::getSourceText(CharRange, SM, LangOpts);
- return hasOnlyComments(Define, LangOpts, BetweenText);
+ return hasOnlyComments(Define, SM, LangOpts, CharRange);
}
void MacroToEnumCallbacks::clearCurrentEnum(SourceLocation Loc) {
@@ -258,17 +255,11 @@ void MacroToEnumCallbacks::conditionStart(const SourceLocation &Loc) {
}
void MacroToEnumCallbacks::checkCondition(SourceRange Range) {
- const CharSourceRange CharRange = Lexer::makeFileCharRange(
- CharSourceRange::getTokenRange(Range), SM, LangOpts);
- std::string Text = Lexer::getSourceText(CharRange, SM, LangOpts).str();
- Lexer Lex(CharRange.getBegin(), LangOpts, Text.data(), Text.data(),
- Text.data() + Text.size());
- Token Tok;
- bool End = false;
- while (!End) {
- End = Lex.LexFromRawLexer(Tok);
- if (Tok.is(tok::raw_identifier) &&
- Tok.getRawIdentifier().str() != "defined")
+ for (const Token Tok : utils::lexer::tokens(
+ Lexer::makeFileCharRange(CharSourceRange::getTokenRange(Range), SM,
+ LangOpts),
+ SM, LangOpts)) {
+ if (Tok.is(tok::raw_identifier) && Tok.getRawIdentifier() != "defined")
checkName(Tok);
}
}
diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
index aa2db2146475b..d3125711b89c3 100644
--- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "RedundantVoidArgCheck.h"
+#include "../utils/LexerUtils.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Lex/Lexer.h"
@@ -127,12 +128,6 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens(
const CharSourceRange CharRange =
Lexer::makeFileCharRange(CharSourceRange::getTokenRange(Range),
*Result.SourceManager, getLangOpts());
-
- std::string DeclText =
- Lexer::getSourceText(CharRange, *Result.SourceManager, getLangOpts())
- .str();
- Lexer PrototypeLexer(CharRange.getBegin(), getLangOpts(), DeclText.data(),
- DeclText.data(), DeclText.data() + DeclText.size());
enum class TokenState {
Start,
MacroId,
@@ -149,7 +144,9 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens(
const std::string Diagnostic =
("redundant void argument list in " + GrammarLocation).str();
- while (!PrototypeLexer.LexFromRawLexer(ProtoToken)) {
+ for (const Token Tok :
+ utils::lexer::tokens(CharRange, *Result.SourceManager, getLangOpts())) {
+ ProtoToken = Tok;
switch (State) {
case TokenState::Start:
if (ProtoToken.is(tok::TokenKind::l_paren))
diff --git a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp
index dd516f8e51264..6de465afcca84 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp
@@ -54,21 +54,12 @@ void UseOverrideCheck::registerMatchers(MatchFinder *Finder) {
static SmallVector<Token, 16>
parseTokens(CharSourceRange Range, const MatchFinder::MatchResult &Result) {
const SourceManager &Sources = *Result.SourceManager;
- const std::pair<FileID, unsigned> LocInfo =
- Sources.getDecomposedLoc(Range.getBegin());
- const StringRef File = Sources.getBufferData(LocInfo.first);
- const char *TokenBegin = File.data() + LocInfo.second;
- Lexer RawLexer(Sources.getLocForStartOfFile(LocInfo.first),
- Result.Context->getLangOpts(), File.begin(), TokenBegin,
- File.end());
SmallVector<Token, 16> Tokens;
- Token Tok;
int NestedParens = 0;
- while (!RawLexer.LexFromRawLexer(Tok)) {
+ for (Token Tok :
+ utils::lexer::tokens(Range, Sources, Result.Context->getLangOpts())) {
if ((Tok.is(tok::semi) || Tok.is(tok::l_brace)) && NestedParens == 0)
break;
- if (Sources.isBeforeInTranslationUnit(Range.getEnd(), Tok.getLocation()))
- break;
if (Tok.is(tok::l_paren))
++NestedParens;
else if (Tok.is(tok::r_paren))
diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
index 02865b65a9ec2..054213ea542b0 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "UseTrailingReturnTypeCheck.h"
+#include "../utils/LexerUtils.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
@@ -14,7 +15,6 @@
#include "clang/Tooling/FixIt.h"
#include "llvm/ADT/StringExtras.h"
-#include <cctype>
#include <optional>
namespace clang::tidy {
@@ -173,13 +173,11 @@ static SourceLocation findTrailingReturnTypeSourceLocation(
Lexer::getLocForEndOfToken(ClosingParen, 0, SM, LangOpts);
// Skip subsequent CV and ref qualifiers.
- const std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(Result);
- const StringRef File = SM.getBufferData(Loc.first);
- const char *TokenBegin = File.data() + Loc.second;
- Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(),
- TokenBegin, File.end());
- Token T;
- while (!Lexer.LexFromRawLexer(T)) {
+ for (Token T : utils::lexer::tokens(
+ Lexer::makeFileCharRange(
+ CharSourceRange::getTokenRange(Result, F.getEndLoc()), SM,
+ LangOpts),
+ SM, LangOpts)) {
if (T.is(tok::raw_identifier)) {
IdentifierInfo &Info = Ctx.Idents.get(
StringRef(SM.getCharacterData(T.getLocation()), T.getLength()));
@@ -255,15 +253,11 @@ classifyTokensBeforeFunctionName(const FunctionDecl &F, const ASTContext &Ctx,
const SourceLocation BeginNameF = expandIfMacroId(F.getLocation(), SM);
// Create tokens for everything before the name of the function.
- const std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(BeginF);
- const StringRef File = SM.getBufferData(Loc.first);
- const char *TokenBegin = File.data() + Loc.second;
- Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(),
- TokenBegin, File.end());
- Token T;
SmallVector<ClassifiedToken, 8> ClassifiedTokens;
- while (!Lexer.LexFromRawLexer(T) &&
- SM.isBeforeInTranslationUnit(T.getLocation(), BeginNameF)) {
+ for (Token T : utils::lexer::tokens(
+ Lexer::makeFileCharRange(
+ CharSourceRange::getCharRange(BeginF, BeginNameF), SM, LangOpts),
+ SM, LangOpts)) {
if (T.is(tok::raw_identifier)) {
IdentifierInfo &Info = Ctx.Idents.get(
StringRef(SM.getCharacterData(T.getLocation()), T.getLength()));
@@ -367,25 +361,20 @@ static SourceLocation findLambdaTrailingReturnInsertLoc(
else
ParamEndLoc = Method->getParametersSourceRange().getEnd();
- const std::pair<FileID, unsigned> ParamEndLocInfo =
- SM.getDecomposedLoc(ParamEndLoc);
- const StringRef Buffer = SM.getBufferData(ParamEndLocInfo.first);
-
- Lexer Lexer(SM.getLocForStartOfFile(ParamEndLocInfo.first), LangOpts,
- Buffer.begin(), Buffer.data() + ParamEndLocInfo.second,
- Buffer.end());
-
- Token Token;
- while (!Lexer.LexFromRawLexer(Token)) {
- if (Token.is(tok::raw_identifier)) {
- IdentifierInfo &Info = Ctx.Idents.get(StringRef(
- SM.getCharacterData(Token.getLocation()), Token.getLength()));
- Token.setIdentifierInfo(&Info);
- Token.setKind(Info.getTokenID());
+ for (Token T : utils::lexer::tokens(
+ Lexer::makeFileCharRange(CharSourceRange::getTokenRange(
+ ParamEndLoc, Method->getEndLoc()),
+ SM, LangOpts),
+ SM, LangOpts)) {
+ if (T.is(tok::raw_identifier)) {
+ IdentifierInfo &Info = Ctx.Idents.get(
+ StringRef(SM.getCharacterData(T.getLocation()), T.getLength()));
+ T.setIdentifierInfo(&Info);
+ T.setKind(Info.getTokenID());
}
- if (Token.is(tok::kw_requires))
- return Token.getLocation().getLocWithOffset(-1);
+ if (T.is(tok::kw_requires))
+ return T.getLocation().getLocWithOffset(-1);
}
return {};
diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
index 1a9c161068030..baf77e6774061 100644
--- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "SimplifyBooleanExprCheck.h"
+#include "../utils/LexerUtils.h"
#include "clang/AST/Expr.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/Basic/DiagnosticIDs.h"
@@ -237,22 +238,12 @@ static std::string replacementExpression(const ASTContext &Context,
static bool containsDiscardedTokens(const ASTContext &Context,
CharSourceRange CharRange) {
- std::string ReplacementText =
- Lexer::getSourceText(CharRange, Context.getSourceManager(),
- Context.getLangOpts())
- .str();
- Lexer Lex(CharRange.getBegin(), Context.getLangOpts(), ReplacementText.data(),
- ReplacementText.data(),
- ReplacementText.data() + ReplacementText.size());
- Lex.SetCommentRetentionState(true);
-
- Token Tok;
- while (!Lex.LexFromRawLexer(Tok)) {
- if (Tok.is(tok::TokenKind::comment) || Tok.is(tok::TokenKind::hash))
- return true;
- }
-
- return false;
+ return llvm::any_of(
+ utils::lexer::tokensIncludingComments(
+ CharRange, Context.getSourceManager(), Context.getLangOpts()),
+ [](Token Tok) {
+ return Tok.isOneOf(tok::TokenKind::comment, tok::TokenKind::hash);
+ });
}
class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> {
diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.h b/clang-tools-extra/clang-tidy/utils/LexerUtils.h
index c5fb646c0efd9..9daf005a6cb00 100644
--- a/clang-tools-extra/clang-tidy/utils/LexerUtils.h
+++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.h
@@ -12,6 +12,8 @@
#include "clang/AST/ASTContext.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Lexer.h"
+#include "clang/Basic/SourceManager.h"
+#include <iterator>
#include <optional>
#include <utility>
@@ -127,6 +129,78 @@ SourceLocation getUnifiedEndLoc(const Stmt &S, const SourceManager &SM,
SourceLocation getLocationForNoexceptSpecifier(const FunctionDecl *FuncDecl,
const SourceManager &SM);
+class TokenView {
+public:
+ class iterator { // NOLINT(readability-identifier-naming)
+ public:
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+ using iterator_category = std::input_iterator_tag;
+
+ iterator &operator++() {
+ if (View->RawLexer.getBufferLocation() < View->EndOfLexedRange)
+ View->RawLexer.LexFromRawLexer(View->Tok);
+ else
+ View = nullptr; // No more tokens.
+ return *this;
+ }
+
+ void operator++(int) { operator++(); }
+
+ friend bool operator==(iterator LHS, iterator RHS) {
+ return LHS.View == RHS.View;
+ }
+
+ friend bool operator!=(iterator LHS, iterator RHS) { return !(LHS == RHS); }
+
+ const Token &operator*() const { return View->Tok; }
+ const Token *operator->() const { return &View->Tok; }
+
+ private:
+ friend class TokenView;
+ iterator(TokenView *V) : View(V) {}
+ TokenView *View;
+ };
+
+ iterator begin() {
+ iterator It(this);
+ ++It;
+ return It;
+ }
+ iterator end() { return {nullptr}; }
+
+ TokenView(CharSourceRange Range, const SourceManager &SM,
+ const LangOptions &LangOpts, bool RetainComments)
+ : RawLexer([&]() -> Lexer {
+ const auto [FID, BeginOffset] = SM.getDecomposedLoc(Range.getBegin());
+ const auto [_, EndOffset] = SM.getDecomposedLoc(Range.getEnd());
+ const StringRef FileContents = SM.getBufferData(FID);
+ const StringRef LexedRange = {FileContents.begin() + BeginOffset,
+ EndOffset - BeginOffset};
+ EndOfLexedRange = LexedRange.end();
+ return {Range.getBegin(), LangOpts, LexedRange.begin(),
+ LexedRange.begin(), FileContents.end()};
+ }()) {
+ RawLexer.SetCommentRetentionState(RetainComments);
+ }
+
+private:
+ Lexer RawLexer;
+ const char *EndOfLexedRange;
+ Token Tok;
+};
+
+inline TokenView tokens(CharSourceRange Range, const SourceManager &SM,
+ const LangOptions &LangOpts) {
+ return {Range, SM, LangOpts, false};
+}
+
+inline TokenView tokensIncludingComments(CharSourceRange Range,
+ const SourceManager &SM,
+ const LangOptions &LangOpts) {
+ return {Range, SM, LangOpts, true};
+}
+
} // namespace tidy::utils::lexer
} // namespace clang
``````````
</details>
https://github.com/llvm/llvm-project/pull/172508
More information about the cfe-commits
mailing list