[clang] 7c1ee5e - [Pseudo] Token/TokenStream, PP directive parser.
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 23 08:52:11 PST 2022
Author: Sam McCall
Date: 2022-02-23T17:52:02+01:00
New Revision: 7c1ee5e95f3159e13edef644db0509a7d49921c3
URL: https://github.com/llvm/llvm-project/commit/7c1ee5e95f3159e13edef644db0509a7d49921c3
DIFF: https://github.com/llvm/llvm-project/commit/7c1ee5e95f3159e13edef644db0509a7d49921c3.diff
LOG: [Pseudo] Token/TokenStream, PP directive parser.
The TokenStream class is the representation of the source code that will
be fed into the GLR parser.
This patch allows a "raw" TokenStream to be built by reading source code.
It also supports scanning a TokenStream to find the directive structure.
Next steps (with placeholders in the code): heuristically choosing a
path through #ifs, preprocessing the code by stripping directives and comments.
These will produce a suitable stream to feed into the parser proper.
Differential Revision: https://reviews.llvm.org/D119162
Added:
clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
clang/include/clang/Tooling/Syntax/Pseudo/Token.h
clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
clang/lib/Tooling/Syntax/Pseudo/Token.cpp
clang/test/Syntax/lex.c
clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp
Modified:
clang/include/clang/Basic/TokenKinds.h
clang/lib/Basic/TokenKinds.cpp
clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
clang/tools/clang-pseudo/ClangPseudo.cpp
clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h
index 4e66aa1c8c2d8..6b7006651f4eb 100644
--- a/clang/include/clang/Basic/TokenKinds.h
+++ b/clang/include/clang/Basic/TokenKinds.h
@@ -68,6 +68,9 @@ const char *getPunctuatorSpelling(TokenKind Kind) LLVM_READNONE;
/// tokens like 'int' and 'dynamic_cast'. Returns NULL for other token kinds.
const char *getKeywordSpelling(TokenKind Kind) LLVM_READNONE;
+/// Returns the spelling of preprocessor keywords, such as "else".
+const char *getPPKeywordSpelling(PPKeywordKind Kind) LLVM_READNONE;
+
/// Return true if this is a raw identifier or an identifier kind.
inline bool isAnyIdentifier(TokenKind K) {
return (K == tok::identifier) || (K == tok::raw_identifier);
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
new file mode 100644
index 0000000000000..11a92042e7496
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Preprocess.h
@@ -0,0 +1,148 @@
+//===--- Preprocess.h - Preprocess token streams -----------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The pseudoparser tries to match a token stream to the C++ grammar.
+// Preprocessor #defines and other directives are not part of this grammar, and
+// should be removed before the file can be parsed.
+//
+// Conditional blocks like #if...#else...#endif are particularly tricky, as
+// simply stripping the directives may not produce a grammatical result:
+//
+// return
+// #ifndef DEBUG
+// 1
+// #else
+// 0
+// #endif
+// ;
+//
+// This header supports analyzing and removing the directives in a source file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PREPROCESS_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// Describes the structure of a source file, as seen by the preprocessor.
+///
+/// The structure is a tree, whose leaves are plain source code and directives,
+/// and whose internal nodes are #if...#endif sections.
+///
+/// (root)
+/// |-+ Directive #include <stdio.h>
+/// |-+ Code int main() {
+/// | ` printf("hello, ");
+/// |-+ Conditional -+ Directive #ifndef NDEBUG
+/// | |-+ Code printf("debug\n");
+/// | |-+ Directive #else
+/// | |-+ Code printf("production\n");
+/// | `-+ Directive #endif
+/// |-+ Code return 0;
+/// ` }
+///
+/// Unlike the clang preprocessor, we model the full tree explicitly.
+/// This class does not recognize macro usage, only directives.
+struct PPStructure {
+ /// A range of code (and possibly comments) containing no directives.
+ struct Code {
+ Token::Range Tokens;
+ };
+ /// A preprocessor directive.
+ struct Directive {
+ /// Raw tokens making up the directive, starting with `#`.
+ Token::Range Tokens;
+ clang::tok::PPKeywordKind Kind = clang::tok::pp_not_keyword;
+ };
+ /// A preprocessor conditional section.
+ ///
+ /// This starts with an #if, #ifdef, #ifndef etc directive.
+ /// It covers all #else branches, and spans until the matching #endif.
+ struct Conditional {
+ /// The sequence of directives that introduce top-level alternative parses.
+ ///
+ /// The first branch will have an #if type directive.
+ /// Subsequent branches will have #else type directives.
+ std::vector<std::pair<Directive, PPStructure>> Branches;
+ /// The directive terminating the conditional, should be #endif.
+ Directive End;
+ };
+
+ /// Some piece of the file. {One of Code, Directive, Conditional}.
+ class Chunk; // Defined below.
+ std::vector<Chunk> Chunks;
+
+ /// Extract preprocessor structure by examining the raw tokens.
+ static PPStructure parse(const TokenStream &);
+
+ // FIXME: add heuristic selection of conditional branches.
+ // FIXME: allow deriving a preprocessed stream
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Chunk &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const PPStructure::Code &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &,
+ const PPStructure::Directive &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &,
+ const PPStructure::Conditional &);
+
+// FIXME: This approximates std::variant<Code, Directive, Conditional>.
+// Switch once we can use C++17.
+class PPStructure::Chunk {
+public:
+ enum Kind { K_Empty, K_Code, K_Directive, K_Conditional };
+ Kind kind() const {
+ return CodeVariant ? K_Code
+ : DirectiveVariant ? K_Directive
+ : ConditionalVariant ? K_Conditional
+ : K_Empty;
+ }
+
+ Chunk() = delete;
+ Chunk(const Chunk &) = delete;
+ Chunk(Chunk &&) = default;
+ Chunk &operator=(const Chunk &) = delete;
+ Chunk &operator=(Chunk &&) = default;
+ ~Chunk() = default;
+
+ // T => Chunk constructor.
+ Chunk(Code C) : CodeVariant(std::move(C)) {}
+ Chunk(Directive C) : DirectiveVariant(std::move(C)) {}
+ Chunk(Conditional C) : ConditionalVariant(std::move(C)) {}
+
+ // Chunk => T& and const T& conversions.
+#define CONVERSION(CONST, V) \
+ explicit operator CONST V &() CONST { return *V##Variant; }
+ CONVERSION(const, Code);
+ CONVERSION(, Code);
+ CONVERSION(const, Directive);
+ CONVERSION(, Directive);
+ CONVERSION(const, Conditional);
+ CONVERSION(, Conditional);
+#undef CONVERSION
+
+private:
+ // Wasteful, a union variant would be better!
+ llvm::Optional<Code> CodeVariant;
+ llvm::Optional<Directive> DirectiveVariant;
+ llvm::Optional<Conditional> ConditionalVariant;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Token.h b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h
new file mode 100644
index 0000000000000..7a73a85eae94d
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Token.h
@@ -0,0 +1,202 @@
+//===--- Token.h - Tokens and token streams in the pseudoparser --*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tokens are the first level of abstraction above bytes used in pseudoparsing.
+// We use clang's lexer to scan the bytes (in raw mode, with no preprocessor).
+// The tokens are wrapped into pseudo::Token, along with line/indent info.
+//
+// Unlike clang, we make multiple passes over the whole file, out-of-order.
+// Therefore we retain the whole token sequence in memory. (This is feasible as
+// we process one file at a time). pseudo::TokenStream holds such a stream.
+// The initial stream holds the raw tokens read from the file, later passes
+// operate on derived TokenStreams (e.g. with directives stripped).
+//
+// Similar facilities from clang that are *not* used:
+// - SourceManager: designed around multiple files and precise macro expansion.
+// - clang::Token: coupled to SourceManager, doesn't retain layout info.
+// (pseudo::Token is similar, but without SourceLocations).
+// - syntax::TokenBuffer: coupled to SourceManager, has #includes and macros.
+// (pseudo::TokenStream is similar, but a flat token list).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_H
+
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+namespace clang {
+class LangOptions;
+namespace syntax {
+namespace pseudo {
+
+/// A single C++ or preprocessor token.
+///
+/// Unlike clang::Token and syntax::Token, these tokens are not connected to a
+/// SourceManager - we are not dealing with multiple files.
+struct Token {
+ /// An Index identifies a token within a stream.
+ using Index = uint32_t;
+ /// A sentinel Index indicating no token.
+ constexpr static Index Invalid = std::numeric_limits<Index>::max();
+ struct Range;
+
+ /// The token text.
+ ///
+ /// Typically from the original source file, but may have been synthesized.
+ StringRef text() const { return StringRef(Data, Length); }
+ const char *Data = nullptr;
+ uint32_t Length = 0;
+
+ /// Zero-based line number for the start of the token.
+ /// This refers to the original source file as written.
+ uint32_t Line = 0;
+ /// Width of whitespace before the first token on this line.
+ uint8_t Indent = 0;
+ /// Flags have some meaning defined by the function that produced this stream.
+ uint8_t Flags = 0;
+ // Helpers to get/set Flags based on `enum class`.
+ template <class T> bool flag(T Mask) const {
+ return Flags & uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+ }
+ template <class T> void setFlag(T Mask) {
+ Flags |= uint8_t{static_cast<std::underlying_type_t<T>>(Mask)};
+ }
+
+ /// The type of token as determined by clang's lexer.
+ clang::tok::TokenKind Kind = clang::tok::unknown;
+};
+static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
+
+/// A half-open range of tokens within a stream.
+struct Token::Range {
+ Index Begin = 0;
+ Index End = 0;
+
+ uint32_t size() const { return End - Begin; }
+ static Range emptyAt(Index Index) { return Range{Index, Index}; }
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token::Range &);
+
+/// A complete sequence of Tokens representing a source file.
+///
+/// This may match a raw file from disk, or be derived from a previous stream.
+/// For example, stripping comments from a TokenStream results in a new stream.
+///
+/// A stream has sentinel 'eof' tokens at each end, e.g. `int main();` becomes:
+/// int main ( ) ;
+/// eof kw_int ident l_paren r_paren semi eof
+/// front() back()
+/// 0 1 2 3 4 5
+class TokenStream {
+public:
+ /// Create an empty stream.
+ ///
+ /// Initially, the stream is appendable and not finalized.
+ /// The token sequence may only be accessed after finalize() is called.
+ ///
+ /// Payload is an opaque object which will be owned by the stream.
+ /// e.g. an allocator to hold backing storage for synthesized token text.
+ explicit TokenStream(std::shared_ptr<void> Payload = nullptr);
+
+ /// Append a token to the stream, which must not be finalized.
+ void push(Token T) {
+ assert(!isFinalized());
+ Storage.push_back(std::move(T));
+ }
+
+ /// Finalize the token stream, allowing tokens to be accessed.
+ /// Tokens may no longer be appended.
+ void finalize();
+ bool isFinalized() const;
+
+ /// Returns the index of T within the stream.
+ ///
+ /// T must be within the stream or the end sentinel (not the start sentinel).
+ Token::Index index(const Token &T) const {
+ assert(isFinalized());
+ assert(&T >= Storage.data() && &T < Storage.data() + Storage.size());
+ assert(&T != Storage.data() && "start sentinel");
+ return &T - Tokens.data();
+ }
+
+ ArrayRef<Token> tokens() const {
+ assert(isFinalized());
+ return Tokens;
+ }
+ ArrayRef<Token> tokens(Token::Range R) const {
+ return tokens().slice(R.Begin, R.End - R.Begin);
+ }
+
+ /// May return the end sentinel if the stream is empty.
+ const Token &front() const {
+ assert(isFinalized());
+ return Storage[1];
+ }
+
+ /// Print the tokens in this stream to the output stream.
+ ///
+ /// The presence of newlines/spaces is preserved, but not the quantity.
+ void print(llvm::raw_ostream &) const;
+
+private:
+ std::shared_ptr<void> Payload;
+
+ MutableArrayRef<Token> Tokens;
+ std::vector<Token> Storage; // eof + Tokens + eof
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const TokenStream &);
+
+/// Extracts a raw token stream from the source code.
+///
+/// All tokens will reference the data of the provided string.
+/// "word-like" tokens such as identifiers and keywords will be raw_identifier.
+TokenStream lex(const std::string &, const clang::LangOptions &);
+enum class LexFlags : uint8_t {
+ /// Marks the token at the start of a logical preprocessor line.
+ /// This is a position where a directive might start.
+ ///
+ /// Here, the first # is StartsPPLine, but second is not (same logical line).
+ /// #define X(error) \
+ /// #error // not a directive!
+ ///
+ /// Careful, the directive may not start exactly on the StartsPPLine token:
+ /// /*comment*/ #include <foo.h>
+ StartsPPLine = 1 << 0,
+ /// Marks tokens containing trigraphs, escaped newlines, UCNs etc.
+ /// The text() of such tokens will contain the raw trigraph.
+ NeedsCleaning = 1 << 1,
+};
+
+/// Derives a token stream by decoding escapes and interpreting raw_identifiers.
+///
+/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and
+/// their backing data is owned by the returned stream.
+/// raw_identifier tokens are assigned specific types (identifier, keyword etc).
+///
+/// The StartsPPLine flag is preserved.
+///
+/// Here identifier cooking happens before preprocessing, while formally we
+/// should only cook raw_identifiers that survive preprocessing.
+/// However, ignoring the Token::Kind of tokens in directives achieves the same.
+/// (And having cooked token kinds in PP-disabled sections is useful for us).
+TokenStream cook(const TokenStream &, const clang::LangOptions &);
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif
diff --git a/clang/lib/Basic/TokenKinds.cpp b/clang/lib/Basic/TokenKinds.cpp
index d55e176c72c4c..c300175ce90ba 100644
--- a/clang/lib/Basic/TokenKinds.cpp
+++ b/clang/lib/Basic/TokenKinds.cpp
@@ -46,6 +46,15 @@ const char *tok::getKeywordSpelling(TokenKind Kind) {
return nullptr;
}
+const char *tok::getPPKeywordSpelling(tok::PPKeywordKind Kind) {
+ switch (Kind) {
+#define PPKEYWORD(x) case tok::pp_##x: return #x;
+#include "clang/Basic/TokenKinds.def"
+ default: break;
+ }
+ return nullptr;
+}
+
bool tok::isAnnotation(TokenKind Kind) {
switch (Kind) {
#define ANNOTATION(X) case annot_ ## X: return true;
diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
index 8afe7f73f3085..be75138e60c60 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,9 +3,12 @@ set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangToolingSyntaxPseudo
Grammar.cpp
GrammarBNF.cpp
+ Lex.cpp
LRGraph.cpp
LRTable.cpp
LRTableBuild.cpp
+ Preprocess.cpp
+ Token.cpp
LINK_LIBS
clangBasic
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
new file mode 100644
index 0000000000000..ac0120cb9e473
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Lex.cpp
@@ -0,0 +1,114 @@
+//===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/LiteralSupport.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
+ clang::SourceLocation Start;
+ // Tokenize using clang's lexer in raw mode.
+ // std::string guarantees null-termination, which the lexer needs.
+ clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
+ Code.data() + Code.size());
+ Lexer.SetCommentRetentionState(true);
+
+ TokenStream Result;
+ clang::Token CT;
+ unsigned LastOffset = 0;
+ unsigned Line = 0;
+ unsigned Indent = 0;
+ for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
+ Lexer.LexFromRawLexer(CT)) {
+ unsigned Offset =
+ CT.getLocation().getRawEncoding() - Start.getRawEncoding();
+
+ Token Tok;
+ Tok.Data = &Code[Offset];
+ Tok.Length = CT.getLength();
+ Tok.Kind = CT.getKind();
+
+ // Update current line number and indentation from raw source code.
+ unsigned NewLineStart = 0;
+ for (unsigned i = LastOffset; i < Offset; ++i) {
+ if (Code[i] == '\n') {
+ NewLineStart = i + 1;
+ ++Line;
+ }
+ }
+ if (NewLineStart || !LastOffset) {
+ Indent = 0;
+ for (char c : StringRef(Code).slice(NewLineStart, Offset)) {
+ if (c == ' ')
+ ++Indent;
+ else if (c == '\t')
+ Indent += 8;
+ else
+ break;
+ }
+ }
+ Tok.Indent = Indent;
+ Tok.Line = Line;
+
+ if (CT.isAtStartOfLine())
+ Tok.setFlag(LexFlags::StartsPPLine);
+ if (CT.needsCleaning() || CT.hasUCN())
+ Tok.setFlag(LexFlags::NeedsCleaning);
+
+ Result.push(Tok);
+ LastOffset = Offset;
+ }
+ Result.finalize();
+ return Result;
+}
+
+TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
+ auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
+ clang::IdentifierTable Identifiers(LangOpts);
+ TokenStream Result(CleanedStorage);
+
+ for (auto Tok : Code.tokens()) {
+ if (Tok.flag(LexFlags::NeedsCleaning)) {
+ // Remove escaped newlines and trigraphs.
+ llvm::SmallString<64> CleanBuffer;
+ const char *Pos = Tok.text().begin();
+ while (Pos < Tok.text().end()) {
+ unsigned CharSize = 0;
+ CleanBuffer.push_back(
+ clang::Lexer::getCharAndSizeNoWarn(Pos, CharSize, LangOpts));
+ assert(CharSize != 0 && "no progress!");
+ Pos += CharSize;
+ }
+ // Remove universal character names (UCN).
+ llvm::SmallString<64> UCNBuffer;
+ clang::expandUCNs(UCNBuffer, CleanBuffer);
+
+ llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage);
+ Tok.Data = Text.data();
+ Tok.Length = Text.size();
+ Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
+ }
+ // Cook raw_identifiers into identifier, keyword, etc.
+ if (Tok.Kind == tok::raw_identifier)
+ Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
+ Result.push(std::move(Tok));
+ }
+
+ Result.finalize();
+ return Result;
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
new file mode 100644
index 0000000000000..3a6403a147c91
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Preprocess.cpp
@@ -0,0 +1,206 @@
+//===--- Preprocess.cpp - Preprocess token streams ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+class PPParser {
+public:
+ explicit PPParser(const TokenStream &Code) : Code(Code), Tok(&Code.front()) {}
+ void parse(PPStructure *Result) { parse(Result, /*TopLevel=*/true); }
+
+private:
+ // Roles that a directive might take within a conditional block.
+ enum class Cond { None, If, Else, End };
+ static Cond classifyDirective(tok::PPKeywordKind K) {
+ switch (K) {
+ case clang::tok::pp_if:
+ case clang::tok::pp_ifdef:
+ case clang::tok::pp_ifndef:
+ return Cond::If;
+ case clang::tok::pp_elif:
+ case clang::tok::pp_elifdef:
+ case clang::tok::pp_elifndef:
+ case clang::tok::pp_else:
+ return Cond::Else;
+ case clang::tok::pp_endif:
+ return Cond::End;
+ default:
+ return Cond::None;
+ }
+ }
+
+ // Parses tokens starting at Tok into PP.
+ // If we reach an End or Else directive that ends PP, returns it.
+ // If TopLevel is true, then we do not expect End and always return None.
+ llvm::Optional<PPStructure::Directive> parse(PPStructure *PP, bool TopLevel) {
+ auto StartsDirective =
+ [&, AllowDirectiveAt((const Token *)nullptr)]() mutable {
+ if (Tok->flag(LexFlags::StartsPPLine)) {
+ // If we considered a comment at the start of a PP-line, it doesn't
+ // start a directive but the directive can still start after it.
+ if (Tok->Kind == tok::comment)
+ AllowDirectiveAt = Tok + 1;
+ return Tok->Kind == tok::hash;
+ }
+ return Tok->Kind == tok::hash && AllowDirectiveAt == Tok;
+ };
+ // Each iteration adds one chunk (or returns, if we see #endif).
+ while (Tok->Kind != tok::eof) {
+ // If there's no directive here, we have a code chunk.
+ if (!StartsDirective()) {
+ const Token *Start = Tok;
+ do
+ ++Tok;
+ while (Tok->Kind != tok::eof && !StartsDirective());
+ PP->Chunks.push_back(PPStructure::Code{
+ Token::Range{Code.index(*Start), Code.index(*Tok)}});
+ continue;
+ }
+
+ // We have some kind of directive.
+ PPStructure::Directive Directive;
+ parseDirective(&Directive);
+ Cond Kind = classifyDirective(Directive.Kind);
+ if (Kind == Cond::If) {
+ // #if or similar, starting a nested conditional block.
+ PPStructure::Conditional Conditional;
+ Conditional.Branches.emplace_back();
+ Conditional.Branches.back().first = std::move(Directive);
+ parseConditional(&Conditional);
+ PP->Chunks.push_back(std::move(Conditional));
+ } else if ((Kind == Cond::Else || Kind == Cond::End) && !TopLevel) {
+ // #endif or similar, ending this PPStructure scope.
+ // (#endif is unexpected at the top level, treat as simple directive).
+ return std::move(Directive);
+ } else {
+ // #define or similar, a simple directive at the current scope.
+ PP->Chunks.push_back(std::move(Directive));
+ }
+ }
+ return None;
+ }
+
+ // Parse the rest of a conditional section, after seeing the If directive.
+ // Returns after consuming the End directive.
+ void parseConditional(PPStructure::Conditional *C) {
+ assert(C->Branches.size() == 1 &&
+ C->Branches.front().second.Chunks.empty() &&
+ "Should be ready to parse first branch body");
+ while (Tok->Kind != tok::eof) {
+ auto Terminator = parse(&C->Branches.back().second, /*TopLevel=*/false);
+ if (!Terminator) {
+ assert(Tok->Kind == tok::eof && "gave up parsing before eof?");
+ C->End.Tokens = Token::Range::emptyAt(Code.index(*Tok));
+ return;
+ }
+ if (classifyDirective(Terminator->Kind) == Cond::End) {
+ C->End = std::move(*Terminator);
+ return;
+ }
+ assert(classifyDirective(Terminator->Kind) == Cond::Else &&
+ "ended branch unexpectedly");
+ C->Branches.emplace_back();
+ C->Branches.back().first = std::move(*Terminator);
+ }
+ }
+
+ // Parse a directive. Tok is the hash.
+ void parseDirective(PPStructure::Directive *D) {
+ assert(Tok->Kind == tok::hash);
+
+ // Directive spans from the hash until the end of line or file.
+ const Token *Begin = Tok++;
+ while (Tok->Kind != tok::eof && !Tok->flag(LexFlags::StartsPPLine))
+ ++Tok;
+ ArrayRef<Token> Tokens{Begin, Tok};
+ D->Tokens = {Code.index(*Tokens.begin()), Code.index(*Tokens.end())};
+
+ // Directive name is the first non-comment token after the hash.
+ Tokens = Tokens.drop_front().drop_while(
+ [](const Token &T) { return T.Kind == tok::comment; });
+ if (!Tokens.empty())
+ D->Kind = PPKeywords.get(Tokens.front().text()).getPPKeywordID();
+ }
+
+ const TokenStream &Code;
+ const Token *Tok;
+ clang::IdentifierTable PPKeywords;
+};
+
+} // namespace
+
+PPStructure PPStructure::parse(const TokenStream &Code) {
+ PPStructure Result;
+ PPParser(Code).parse(&Result);
+ return Result;
+}
+
+static void dump(llvm::raw_ostream &OS, const PPStructure &, unsigned Indent);
+static void dump(llvm::raw_ostream &OS, const PPStructure::Directive &Directive,
+ unsigned Indent) {
+ OS.indent(Indent) << llvm::formatv("#{0} ({1} tokens)\n",
+ tok::getPPKeywordSpelling(Directive.Kind),
+ Directive.Tokens.size());
+}
+static void dump(llvm::raw_ostream &OS, const PPStructure::Code &Code,
+ unsigned Indent) {
+ OS.indent(Indent) << llvm::formatv("code ({0} tokens)\n", Code.Tokens.size());
+}
+static void dump(llvm::raw_ostream &OS,
+ const PPStructure::Conditional &Conditional, unsigned Indent) {
+ for (const auto &Branch : Conditional.Branches) {
+ dump(OS, Branch.first, Indent);
+ dump(OS, Branch.second, Indent + 2);
+ }
+ dump(OS, Conditional.End, Indent);
+}
+
+static void dump(llvm::raw_ostream &OS, const PPStructure::Chunk &Chunk,
+ unsigned Indent) {
+ switch (Chunk.kind()) {
+ case PPStructure::Chunk::K_Empty:
+ llvm_unreachable("invalid chunk");
+ case PPStructure::Chunk::K_Code:
+ return dump(OS, (const PPStructure::Code &)Chunk, Indent);
+ case PPStructure::Chunk::K_Directive:
+ return dump(OS, (const PPStructure::Directive &)Chunk, Indent);
+ case PPStructure::Chunk::K_Conditional:
+ return dump(OS, (const PPStructure::Conditional &)Chunk, Indent);
+ }
+}
+
+static void dump(llvm::raw_ostream &OS, const PPStructure &PP,
+ unsigned Indent) {
+ for (const auto &Chunk : PP.Chunks)
+ dump(OS, Chunk, Indent);
+}
+
+// Define operator<< in terms of dump() functions above.
+#define OSTREAM_DUMP(Type) \
+ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Type &T) { \
+ dump(OS, T, 0); \
+ return OS; \
+ }
+OSTREAM_DUMP(PPStructure)
+OSTREAM_DUMP(PPStructure::Chunk)
+OSTREAM_DUMP(PPStructure::Directive)
+OSTREAM_DUMP(PPStructure::Conditional)
+OSTREAM_DUMP(PPStructure::Code)
+#undef OSTREAM_DUMP
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Token.cpp b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp
new file mode 100644
index 0000000000000..070bda4c50031
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Token.cpp
@@ -0,0 +1,98 @@
+//===--- Token.cpp - Tokens and token streams in the pseudoparser ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+ OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
+ T.Indent);
+ OS << '"';
+ llvm::printEscapedString(T.text(), OS);
+ OS << '"';
+ if (T.Flags)
+ OS << llvm::format(" flags=%x", T.Flags);
+ return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const TokenStream &TS) {
+ OS << "Index Kind Line Text\n";
+ for (const auto &T : TS.tokens()) {
+ OS << llvm::format("%5d: %16s %4d:%-2d ", TS.index(T),
+ clang::tok::getTokenName(T.Kind), T.Line, T.Indent);
+ OS << '"';
+ llvm::printEscapedString(T.text(), OS);
+ OS << '"';
+ if (T.Flags)
+ OS << llvm::format(" flags=%x", T.Flags);
+ OS << '\n';
+ }
+ return OS;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token::Range &R) {
+ OS << llvm::formatv("[{0},{1})", R.Begin, R.End);
+ return OS;
+}
+
+TokenStream::TokenStream(std::shared_ptr<void> Payload)
+ : Payload(std::move(Payload)) {
+ Storage.emplace_back();
+ Storage.back().Kind = clang::tok::eof;
+}
+
+void TokenStream::finalize() {
+ assert(!isFinalized());
+ unsigned LastLine = Storage.back().Line;
+ Storage.emplace_back();
+ Storage.back().Kind = tok::eof;
+ Storage.back().Line = LastLine + 1;
+
+ Tokens = Storage;
+ Tokens = Tokens.drop_front().drop_back();
+}
+
+bool TokenStream::isFinalized() const {
+ assert(!Storage.empty() && Storage.front().Kind == tok::eof);
+ if (Storage.size() == 1)
+ return false;
+ return Storage.back().Kind == tok::eof;
+}
+
+void TokenStream::print(llvm::raw_ostream &OS) const {
+ bool FirstToken = true;
+ unsigned LastLine = -1;
+ StringRef LastText;
+ for (const auto &T : tokens()) {
+ StringRef Text = T.text();
+ if (FirstToken) {
+ FirstToken = false;
+ } else if (T.Line == LastLine) {
+ if (LastText.data() + LastText.size() != Text.data())
+ OS << ' ';
+ } else {
+ OS << '\n';
+ OS.indent(T.Indent);
+ }
+ OS << Text;
+ LastLine = T.Line;
+ LastText = Text;
+ }
+ if (!FirstToken)
+ OS << '\n';
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/test/Syntax/lex.c b/clang/test/Syntax/lex.c
new file mode 100644
index 0000000000000..7ec015417a177
--- /dev/null
+++ b/clang/test/Syntax/lex.c
@@ -0,0 +1,52 @@
+int is_debug() {
+#ifndef NDEBUG
+ return 1; // in debug mode
+#else
+ return 0;
+#endif
+}
+
+/* This comment gets lexed along with the input above! We just don't CHECK it.
+
+RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
+ SOURCE: int is_debug() {
+SOURCE-NEXT: #ifndef NDEBUG
+SOURCE-NEXT: return 1; // in debug mode
+SOURCE-NEXT: #else
+SOURCE-NEXT: return 0;
+SOURCE-NEXT: #endif
+SOURCE-NEXT: }
+
+RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
+ TOKEN: 0: raw_identifier 0:0 "int" flags=1
+TOKEN-NEXT: raw_identifier 0:0 "is_debug"
+TOKEN-NEXT: l_paren 0:0 "("
+TOKEN-NEXT: r_paren 0:0 ")"
+TOKEN-NEXT: l_brace 0:0 "{"
+TOKEN-NEXT: hash 1:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 1:0 "ifndef"
+TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
+TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 2:2 "1"
+TOKEN-NEXT: semi 2:2 ";"
+TOKEN-NEXT: comment 2:2 "// in debug mode"
+TOKEN-NEXT: hash 3:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 3:0 "else"
+TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
+TOKEN-NEXT: numeric_constant 4:2 "0"
+TOKEN-NEXT: semi 4:2 ";"
+TOKEN-NEXT: hash 5:0 "#" flags=1
+TOKEN-NEXT: raw_identifier 5:0 "endif"
+TOKEN-NEXT: r_brace 6:0 "}" flags=1
+
+RUN: clang-pseudo -source %s -print-pp-structure | FileCheck %s -check-prefix=PPS --strict-whitespace
+ PPS: code (5 tokens)
+PPS-NEXT: #ifndef (3 tokens)
+PPS-NEXT: code (4 tokens)
+PPS-NEXT: #else (2 tokens)
+PPS-NEXT: code (3 tokens)
+PPS-NEXT: #endif (2 tokens)
+PPS-NEXT: code (2 tokens)
+ ^ including this block comment
+
+*******************************************************************************/
diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp
index 449b9181f3ee0..2d6fbdb83944c 100644
--- a/clang/tools/clang-pseudo/ClangPseudo.cpp
+++ b/clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -6,9 +6,12 @@
//
//===----------------------------------------------------------------------===//
+#include "clang/Basic/LangOptions.h"
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
@@ -25,13 +28,19 @@ static opt<bool> PrintGraph("print-graph",
desc("Print the LR graph for the grammar"));
static opt<bool> PrintTable("print-table",
desc("Print the LR table for the grammar"));
+static opt<std::string> Source("source", desc("Source file"));
+static opt<bool> PrintSource("print-source", desc("Print token stream"));
+static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
+static opt<bool>
+ PrintPPStructure("print-pp-structure",
+ desc("Print directive structure of source code"));
static std::string readOrDie(llvm::StringRef Path) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
llvm::MemoryBuffer::getFile(Path);
if (std::error_code EC = Text.getError()) {
- llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message()
- << "\n";
+ llvm::errs() << "Error: can't read grammar file '" << Path
+ << "': " << EC.message() << "\n";
::exit(1);
}
return Text.get()->getBuffer().str();
@@ -60,5 +69,19 @@ int main(int argc, char *argv[]) {
return 0;
}
+ if (Source.getNumOccurrences()) {
+ std::string Text = readOrDie(Source);
+ clang::LangOptions LangOpts; // FIXME: use real options.
+ auto Stream = clang::syntax::pseudo::lex(Text, LangOpts);
+ auto Structure = clang::syntax::pseudo::PPStructure::parse(Stream);
+
+ if (PrintPPStructure)
+ llvm::outs() << Structure;
+ if (PrintSource)
+ Stream.print(llvm::outs());
+ if (PrintTokens)
+ llvm::outs() << Stream;
+ }
+
return 0;
}
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
index 509e9e4a1598b..658ad9d926b96 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -5,6 +5,8 @@ set(LLVM_LINK_COMPONENTS
add_clang_unittest(ClangPseudoTests
GrammarTest.cpp
LRTableTest.cpp
+ PreprocessTest.cpp
+ TokenTest.cpp
)
clang_target_link_libraries(ClangPseudoTests
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
new file mode 100644
index 0000000000000..b6ff47d7fc8dc
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/PreprocessTest.cpp
@@ -0,0 +1,152 @@
+//===--- PreprocessTest.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Preprocess.h"
+
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::_;
+using testing::ElementsAre;
+using testing::Matcher;
+using testing::Pair;
+using testing::StrEq;
+using Chunk = PPStructure::Chunk;
+
+MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
+ std::vector<llvm::StringRef> Texts;
+ for (const Token &Tok : TS.tokens(arg.Tokens))
+ Texts.push_back(Tok.text());
+ return Matcher<std::string>(StrEq(Tokens))
+ .MatchAndExplain(llvm::join(Texts, " "), result_listener);
+}
+
+MATCHER_P(chunkKind, K, "") { return arg.kind() == K; }
+
+TEST(PPStructure, Parse) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ #include <foo.h>
+
+ int main() {
+ #ifdef HAS_FOO
+ #if HAS_BAR
+ foo(bar);
+ #else
+ foo(0)
+ #endif
+ #elif NEEDS_FOO
+ #error missing_foo
+ #endif
+ }
+ )cpp";
+
+ TokenStream S = cook(lex(Code, Opts), Opts);
+ PPStructure PP = PPStructure::parse(S);
+
+ ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Directive),
+ chunkKind(Chunk::K_Code),
+ chunkKind(Chunk::K_Conditional),
+ chunkKind(Chunk::K_Code)));
+
+ EXPECT_THAT((const PPStructure::Directive &)PP.Chunks[0],
+ tokensAre(S, "# include < foo . h >"));
+ EXPECT_THAT((const PPStructure::Code &)PP.Chunks[1],
+ tokensAre(S, "int main ( ) {"));
+ EXPECT_THAT((const PPStructure::Code &)PP.Chunks[3], tokensAre(S, "}"));
+
+ const PPStructure::Conditional &Ifdef(PP.Chunks[2]);
+ EXPECT_THAT(Ifdef.Branches,
+ ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
+ Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
+ EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));
+
+ const PPStructure &HasFoo(Ifdef.Branches[0].second);
+ const PPStructure &NeedsFoo(Ifdef.Branches[1].second);
+
+ EXPECT_THAT(HasFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Conditional)));
+ const PPStructure::Conditional &If(HasFoo.Chunks[0]);
+ EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
+ Pair(tokensAre(S, "# else"), _)));
+ EXPECT_THAT(If.Branches[0].second.Chunks,
+ ElementsAre(chunkKind(Chunk::K_Code)));
+ EXPECT_THAT(If.Branches[1].second.Chunks,
+ ElementsAre(chunkKind(Chunk::K_Code)));
+
+ EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(chunkKind(Chunk::K_Directive)));
+ const PPStructure::Directive &Error(NeedsFoo.Chunks[0]);
+ EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
+ EXPECT_EQ(Error.Kind, tok::pp_error);
+}
+
+TEST(PPStructure, ParseUgly) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ /*A*/ # /*B*/ \
+ /*C*/ \
+define \
+BAR /*D*/
+/*E*/
+)cpp";
+ TokenStream S = cook(lex(Code, Opts), Opts);
+ PPStructure PP = PPStructure::parse(S);
+
+ ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
+ chunkKind(Chunk::K_Directive),
+ chunkKind(Chunk::K_Code)));
+ EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "/*A*/"));
+ const PPStructure::Directive &Define(PP.Chunks[1]);
+ EXPECT_EQ(Define.Kind, tok::pp_define);
+ EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/"));
+ EXPECT_THAT((const PPStructure::Code &)PP.Chunks[2], tokensAre(S, "/*E*/"));
+}
+
+TEST(PPStructure, ParseBroken) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ a
+ #endif // mismatched
+ #if X
+ b
+)cpp";
+ TokenStream S = cook(lex(Code, Opts), Opts);
+ PPStructure PP = PPStructure::parse(S);
+
+ ASSERT_THAT(PP.Chunks, ElementsAre(chunkKind(Chunk::K_Code),
+ chunkKind(Chunk::K_Directive),
+ chunkKind(Chunk::K_Conditional)));
+ EXPECT_THAT((const PPStructure::Code &)PP.Chunks[0], tokensAre(S, "a"));
+ const PPStructure::Directive &Endif(PP.Chunks[1]);
+ EXPECT_EQ(Endif.Kind, tok::pp_endif);
+ EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
+
+ const PPStructure::Conditional &X(PP.Chunks[2]);
+ EXPECT_EQ(1u, X.Branches.size());
+ // The (only) branch of the broken conditional section runs until eof.
+ EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
+ EXPECT_THAT(X.Branches.front().second.Chunks,
+ ElementsAre(chunkKind(Chunk::K_Code)));
+ // The missing terminating directive is marked as pp_not_keyword.
+ EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
+ EXPECT_EQ(0u, X.End.Tokens.size());
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp
new file mode 100644
index 0000000000000..f790e65245741
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/TokenTest.cpp
@@ -0,0 +1,178 @@
+//===--- TokenTest.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TokenKinds.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::Not;
+
+MATCHER_P2(token, Text, Kind, "") {
+ return arg.Kind == Kind && arg.text() == Text;
+}
+
+MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
+
+MATCHER_P2(lineIndent, Line, Indent, "") {
+ return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
+}
+
+TEST(TokenTest, Lex) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+ #include <stdio.h>
+ int main() {
+ return 42; // the answer
+ }
+ )cpp";
+ TokenStream Raw = lex(Code, Opts);
+ ASSERT_TRUE(Raw.isFinalized());
+ EXPECT_THAT(Raw.tokens(),
+ ElementsAreArray({
+ // Lexing of directives is weird, especially <angled> strings.
+ token("#", tok::hash),
+ token("include", tok::raw_identifier),
+ token("<", tok::less),
+ token("stdio", tok::raw_identifier),
+ token(".", tok::period),
+ token("h", tok::raw_identifier),
+ token(">", tok::greater),
+
+ token("int", tok::raw_identifier),
+ token("main", tok::raw_identifier),
+ token("(", tok::l_paren),
+ token(")", tok::r_paren),
+ token("{", tok::l_brace),
+ token("return", tok::raw_identifier),
+ token("42", tok::numeric_constant),
+ token(";", tok::semi),
+ token("// the answer", tok::comment),
+ token("}", tok::r_brace),
+ }));
+
+ TokenStream Cooked = cook(Raw, Opts);
+ ASSERT_TRUE(Cooked.isFinalized());
+ EXPECT_THAT(Cooked.tokens(),
+ ElementsAreArray({
+ // Cooked identifier types in directives are not meaningful.
+ token("#", tok::hash),
+ token("include", tok::identifier),
+ token("<", tok::less),
+ token("stdio", tok::identifier),
+ token(".", tok::period),
+ token("h", tok::identifier),
+ token(">", tok::greater),
+
+ token("int", tok::kw_int),
+ token("main", tok::identifier),
+ token("(", tok::l_paren),
+ token(")", tok::r_paren),
+ token("{", tok::l_brace),
+ token("return", tok::kw_return),
+ token("42", tok::numeric_constant),
+ token(";", tok::semi),
+ token("// the answer", tok::comment),
+ token("}", tok::r_brace),
+ }));
+ // Check raw tokens point back into original source code.
+ EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
+}
+
+TEST(TokenTest, LineContinuation) {
+ LangOptions Opts;
+ std::string Code = R"cpp(
+one_\
+token
+two \
+tokens
+ )cpp";
+ TokenStream Raw = lex(Code, Opts);
+ EXPECT_THAT(
+ Raw.tokens(),
+ ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
+ hasFlag(LexFlags::StartsPPLine),
+ hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0)),
+ AllOf(token("two", tok::raw_identifier),
+ hasFlag(LexFlags::StartsPPLine),
+ Not(hasFlag(LexFlags::NeedsCleaning))),
+ AllOf(token("\\\ntokens", tok::raw_identifier),
+ Not(hasFlag(LexFlags::StartsPPLine)),
+ hasFlag(LexFlags::NeedsCleaning))));
+
+ TokenStream Cooked = cook(Raw, Opts);
+ EXPECT_THAT(
+ Cooked.tokens(),
+ ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0)),
+ token("two", tok::identifier),
+ token("tokens", tok::identifier)));
+}
+
+TEST(TokenTest, EncodedCharacters) {
+ LangOptions Opts;
+ Opts.Trigraphs = true;
+ Opts.Digraphs = true;
+ Opts.C99 = true; // UCNs
+ Opts.CXXOperatorNames = true;
+ std::string Code = R"(and <: ??! '??=' \u00E9)";
+ TokenStream Raw = lex(Code, Opts);
+ EXPECT_THAT(
+ Raw.tokens(),
+ ElementsAre( // and is not recognized as && until cook().
+ AllOf(token("and", tok::raw_identifier),
+ Not(hasFlag(LexFlags::NeedsCleaning))),
+ // Digraphs are just different spellings of tokens.
+ AllOf(token("<:", tok::l_square),
+ Not(hasFlag(LexFlags::NeedsCleaning))),
+ // Trigraphs are interpreted, still need text cleaning.
+ AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
+ // Trigraphs must be substituted inside constants too.
+ AllOf(token(R"('??=')", tok::char_constant),
+ hasFlag(LexFlags::NeedsCleaning)),
+ // UCNs need substitution.
+ AllOf(token(R"(\u00E9)", tok::raw_identifier),
+ hasFlag(LexFlags::NeedsCleaning))));
+
+ TokenStream Cooked = cook(Raw, Opts);
+ EXPECT_THAT(
+ Cooked.tokens(),
+ ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
+ token("<:", tok::l_square),
+ token("|", tok::pipe), // trigraph substituted
+ token("'#'", tok::char_constant), // trigraph substituted
+ token("é", tok::identifier))); // UCN substituted
+}
+
+TEST(TokenTest, Indentation) {
+ LangOptions Opts;
+ std::string Code = R"cpp( hello world
+no_indent \
+ line_was_continued
+)cpp";
+ TokenStream Raw = lex(Code, Opts);
+ EXPECT_THAT(Raw.tokens(), ElementsAreArray({
+ lineIndent(0, 3), // hello
+ lineIndent(0, 3), // world
+ lineIndent(1, 0), // no_indent
+ lineIndent(2, 2), // line_was_continued
+ }));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
More information about the cfe-commits
mailing list