[clang] 8c2cf49 - [clang][Tooling] Add a way to tokenize a FileRange
Kadir Cetinkaya via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 26 04:50:52 PST 2020
Author: Kadir Cetinkaya
Date: 2020-02-26T13:50:41+01:00
New Revision: 8c2cf499e6119be8f3f1a0d42c4bb7e45b0d615d
URL: https://github.com/llvm/llvm-project/commit/8c2cf499e6119be8f3f1a0d42c4bb7e45b0d615d
DIFF: https://github.com/llvm/llvm-project/commit/8c2cf499e6119be8f3f1a0d42c4bb7e45b0d615d.diff
LOG: [clang][Tooling] Add a way to tokenize a FileRange
Reviewers: sammccall
Subscribers: cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D74962
Added:
Modified:
clang/include/clang/Tooling/Syntax/Tokens.h
clang/lib/Tooling/Syntax/Tokens.cpp
clang/unittests/Tooling/Syntax/TokensTest.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Tooling/Syntax/Tokens.h b/clang/include/clang/Tooling/Syntax/Tokens.h
index fc0fabb40658..19d120ebbc9f 100644
--- a/clang/include/clang/Tooling/Syntax/Tokens.h
+++ b/clang/include/clang/Tooling/Syntax/Tokens.h
@@ -339,6 +339,12 @@ spelledIdentifierTouching(SourceLocation Loc,
/// The result will *not* have a 'eof' token at the end.
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
const LangOptions &LO);
+/// Similar to the overload above, but tokenizes only a part of the file
+/// instead of the whole file. Note that the first token might be incomplete if
+/// FR.startOffset is not at the beginning of a token, and the last token
+/// returned will start before FR.endOffset but might end after it.
+std::vector<syntax::Token>
+tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created on
diff --git a/clang/lib/Tooling/Syntax/Tokens.cpp b/clang/lib/Tooling/Syntax/Tokens.cpp
index 9f51ab787ad1..ae5bc687553b 100644
--- a/clang/lib/Tooling/Syntax/Tokens.cpp
+++ b/clang/lib/Tooling/Syntax/Tokens.cpp
@@ -67,7 +67,8 @@ FileRange syntax::Token::range(const SourceManager &SM,
auto F = First.range(SM);
auto L = Last.range(SM);
assert(F.file() == L.file() && "tokens from different files");
- assert((F == L || F.endOffset() <= L.beginOffset()) && "wrong order of tokens");
+ assert((F == L || F.endOffset() <= L.beginOffset()) &&
+ "wrong order of tokens");
return FileRange(F.file(), F.beginOffset(), L.endOffset());
}
@@ -307,7 +308,8 @@ TokenBuffer::macroExpansions(FileID FID) const {
return Expansions;
}
-std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+std::vector<syntax::Token> syntax::tokenize(const FileRange &FR,
+ const SourceManager &SM,
const LangOptions &LO) {
std::vector<syntax::Token> Tokens;
IdentifierTable Identifiers(LO);
@@ -322,18 +324,28 @@ std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
Tokens.push_back(syntax::Token(T));
};
- Lexer L(FID, SM.getBuffer(FID), SM, LO);
+ auto SrcBuffer = SM.getBufferData(FR.file());
+ Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(),
+ SrcBuffer.data() + FR.beginOffset(),
+ // We can't make BufEnd point to FR.endOffset, as Lexer requires a
+ // null terminated buffer.
+ SrcBuffer.data() + SrcBuffer.size());
clang::Token T;
- while (!L.LexFromRawLexer(T))
+ while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
AddToken(T);
- // 'eof' is only the last token if the input is null-terminated. Never store
- // it, for consistency.
- if (T.getKind() != tok::eof)
+ // LexFromRawLexer returns true when it parses the last token of the file, add
+ // it iff it starts within the range we are interested in.
+ if (SM.getFileOffset(T.getLocation()) < FR.endOffset())
AddToken(T);
return Tokens;
}
+std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+ const LangOptions &LO) {
+ return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO);
+}
+
/// Records information required to construct mappings for the token buffer that
/// we are collecting.
class TokenCollector::CollectPPExpansions : public PPCallbacks {
diff --git a/clang/unittests/Tooling/Syntax/TokensTest.cpp b/clang/unittests/Tooling/Syntax/TokensTest.cpp
index b2ad3859104a..ad0293bc3e07 100644
--- a/clang/unittests/Tooling/Syntax/TokensTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -153,11 +153,17 @@ class TokenCollectorTest : public ::testing::Test {
}
}
- /// Add a new file, run syntax::tokenize() on it and return the results.
+  /// Add a new file and run syntax::tokenize() on the annotated range if one
+  /// is present, or on the whole file otherwise, and return the results.
std::vector<syntax::Token> tokenize(llvm::StringRef Text) {
+ llvm::Annotations Annot(Text);
+ auto FID = SourceMgr->createFileID(
+ llvm::MemoryBuffer::getMemBufferCopy(Annot.code()));
// FIXME: pass proper LangOptions.
+ if (Annot.ranges().empty())
+ return syntax::tokenize(FID, *SourceMgr, LangOptions());
return syntax::tokenize(
- SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(Text)),
+ syntax::FileRange(FID, Annot.range().Begin, Annot.range().End),
*SourceMgr, LangOptions());
}
@@ -258,6 +264,20 @@ TEST_F(TokenCollectorTest, RawMode) {
ElementsAre(Kind(tok::kw_int),
AllOf(HasText("a"), Kind(tok::identifier)),
Kind(tok::semi)));
+ EXPECT_THAT(tokenize("int [[main() {]]}"),
+ ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)),
+ Kind(tok::l_paren), Kind(tok::r_paren),
+ Kind(tok::l_brace)));
+ EXPECT_THAT(tokenize("int [[main() { ]]}"),
+ ElementsAre(AllOf(HasText("main"), Kind(tok::identifier)),
+ Kind(tok::l_paren), Kind(tok::r_paren),
+ Kind(tok::l_brace)));
+ // First token is partially parsed, last token is fully included even though
+ // only a part of it is contained in the range.
+ EXPECT_THAT(tokenize("int m[[ain() {ret]]urn 0;}"),
+ ElementsAre(AllOf(HasText("ain"), Kind(tok::identifier)),
+ Kind(tok::l_paren), Kind(tok::r_paren),
+ Kind(tok::l_brace), Kind(tok::kw_return)));
}
TEST_F(TokenCollectorTest, Basic) {
More information about the cfe-commits
mailing list