[clang] [clang-tools-extra] Remove clang-pseudo (PR #109154)
Aaron Ballman via cfe-commits
cfe-commits at lists.llvm.org
Wed Sep 18 08:00:59 PDT 2024
https://github.com/AaronBallman created https://github.com/llvm/llvm-project/pull/109154
The functionality is incomplete and the authors have since shifted gears to other work, so this is effectively unmaintained.
The original design document for clang-pseudo can be found at:
https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
in case anyone wishes to pick this project back up again in the future.
Original RFC: https://discourse.llvm.org/t/removing-pseudo-parser/71131/
From 74eb548cc91240bb24dcfe18f6d812d57455777f Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron at aaronballman.com>
Date: Wed, 18 Sep 2024 10:57:47 -0400
Subject: [PATCH] Remove clang-pseudo
The functionality is incomplete and the authors have since shifted
gears to other work, so this is effectively unmaintained.
Original RFC: https://discourse.llvm.org/t/removing-pseudo-parser/71131/
---
clang-tools-extra/CMakeLists.txt | 1 -
clang-tools-extra/clangd/CMakeLists.txt | 1 -
.../clangd/SemanticSelection.cpp | 32 +-
.../lib => clangd/support}/Bracket.cpp | 8 +-
.../clang-pseudo => clangd/support}/Bracket.h | 12 +-
.../clangd/support/CMakeLists.txt | 4 +
.../lib => clangd/support}/DirectiveTree.cpp | 6 +-
.../support}/DirectiveTree.h | 12 +-
.../{pseudo/lib => clangd/support}/Lex.cpp | 6 +-
.../{pseudo/lib => clangd/support}/Token.cpp | 6 +-
.../clang-pseudo => clangd/support}/Token.h | 10 +-
clang-tools-extra/pseudo/CMakeLists.txt | 12 -
clang-tools-extra/pseudo/DesignNotes.md | 123 ---
clang-tools-extra/pseudo/Disambiguation.md | 367 --------
clang-tools-extra/pseudo/README.md | 37 -
.../pseudo/benchmarks/Benchmark.cpp | 156 ----
.../pseudo/benchmarks/CMakeLists.txt | 9 -
.../pseudo/fuzzer/CMakeLists.txt | 16 -
clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp | 82 --
clang-tools-extra/pseudo/fuzzer/Main.cpp | 16 -
clang-tools-extra/pseudo/gen/CMakeLists.txt | 11 -
clang-tools-extra/pseudo/gen/Main.cpp | 172 ----
.../pseudo/include/CMakeLists.txt | 32 -
.../include/clang-pseudo/Disambiguate.h | 64 --
.../pseudo/include/clang-pseudo/Forest.h | 236 ------
.../pseudo/include/clang-pseudo/GLR.h | 170 ----
.../pseudo/include/clang-pseudo/Language.h | 64 --
.../pseudo/include/clang-pseudo/cli/CLI.h | 35 -
.../pseudo/include/clang-pseudo/cxx/CXX.h | 91 --
.../include/clang-pseudo/grammar/Grammar.h | 230 -----
.../include/clang-pseudo/grammar/LRGraph.h | 196 -----
.../include/clang-pseudo/grammar/LRTable.h | 278 ------
clang-tools-extra/pseudo/lib/CMakeLists.txt | 31 -
clang-tools-extra/pseudo/lib/Disambiguate.cpp | 48 --
clang-tools-extra/pseudo/lib/Forest.cpp | 199 -----
clang-tools-extra/pseudo/lib/GLR.cpp | 772 -----------------
clang-tools-extra/pseudo/lib/cli/CLI.cpp | 54 --
.../pseudo/lib/cli/CMakeLists.txt | 15 -
.../pseudo/lib/cxx/CMakeLists.txt | 19 -
clang-tools-extra/pseudo/lib/cxx/CXX.cpp | 452 ----------
clang-tools-extra/pseudo/lib/cxx/cxx.bnf | 775 -----------------
.../pseudo/lib/grammar/CMakeLists.txt | 10 -
.../pseudo/lib/grammar/Grammar.cpp | 190 -----
.../pseudo/lib/grammar/GrammarBNF.cpp | 362 --------
.../pseudo/lib/grammar/LRGraph.cpp | 265 ------
.../pseudo/lib/grammar/LRTable.cpp | 79 --
.../pseudo/lib/grammar/LRTableBuild.cpp | 121 ---
clang-tools-extra/pseudo/test/.clang-format | 1 -
clang-tools-extra/pseudo/test/CMakeLists.txt | 29 -
clang-tools-extra/pseudo/test/Unit/lit.cfg.py | 25 -
.../pseudo/test/Unit/lit.site.cfg.py.in | 11 -
.../pseudo/test/check-cxx-bnf.test | 2 -
.../pseudo/test/crash/backslashes.c | 4 -
.../pseudo/test/cxx/capture-list.cpp | 23 -
.../pseudo/test/cxx/contextual-keywords.cpp | 9 -
.../pseudo/test/cxx/dangling-else.cpp | 22 -
.../pseudo/test/cxx/decl-specfier-seq.cpp | 27 -
.../pseudo/test/cxx/declarator-function.cpp | 9 -
.../pseudo/test/cxx/declarator-var.cpp | 9 -
.../test/cxx/declator-member-function.cpp | 9 -
.../test/cxx/empty-member-declaration.cpp | 7 -
.../pseudo/test/cxx/empty-member-spec.cpp | 13 -
clang-tools-extra/pseudo/test/cxx/keyword.cpp | 12 -
.../pseudo/test/cxx/literals.cpp | 43 -
.../pseudo/test/cxx/mixed-designator.cpp | 27 -
.../pseudo/test/cxx/nested-name-specifier.cpp | 28 -
.../pseudo/test/cxx/parameter-decl-clause.cpp | 14 -
.../pseudo/test/cxx/predefined-identifier.cpp | 5 -
.../test/cxx/recovery-func-parameters.cpp | 13 -
.../pseudo/test/cxx/recovery-init-list.cpp | 13 -
.../pseudo/test/cxx/structured-binding.cpp | 6 -
.../cxx/template-empty-type-parameter.cpp | 3 -
.../pseudo/test/cxx/unsized-array.cpp | 7 -
clang-tools-extra/pseudo/test/fuzzer.cpp | 4 -
.../pseudo/test/glr-variant-start.cpp | 9 -
clang-tools-extra/pseudo/test/glr.cpp | 30 -
clang-tools-extra/pseudo/test/html-forest.c | 8 -
clang-tools-extra/pseudo/test/lex.c | 42 -
clang-tools-extra/pseudo/test/lit.cfg.py | 20 -
clang-tools-extra/pseudo/test/lit.local.cfg | 2 -
.../pseudo/test/lit.site.cfg.py.in | 14 -
.../pseudo/test/lr-build-basic.test | 32 -
.../pseudo/test/lr-build-conflicts.test | 49 --
.../pseudo/test/strip-directives.c | 49 --
clang-tools-extra/pseudo/tool/CMakeLists.txt | 30 -
clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 243 ------
clang-tools-extra/pseudo/tool/HTMLForest.cpp | 192 -----
clang-tools-extra/pseudo/tool/HTMLForest.css | 93 ---
clang-tools-extra/pseudo/tool/HTMLForest.html | 15 -
clang-tools-extra/pseudo/tool/HTMLForest.js | 290 -------
.../pseudo/unittests/BracketTest.cpp | 117 ---
.../pseudo/unittests/CMakeLists.txt | 32 -
.../pseudo/unittests/CXXTest.cpp | 30 -
.../pseudo/unittests/DirectiveTreeTest.cpp | 357 --------
.../pseudo/unittests/DisambiguateTest.cpp | 111 ---
.../pseudo/unittests/ForestTest.cpp | 180 ----
.../pseudo/unittests/GLRTest.cpp | 789 ------------------
.../pseudo/unittests/GrammarTest.cpp | 213 -----
.../pseudo/unittests/LRTableTest.cpp | 76 --
.../pseudo/unittests/TokenTest.cpp | 224 -----
clang/docs/ClangFormattedStatus.rst | 5 -
clang/docs/tools/clang-formatted-files.txt | 1 -
102 files changed, 50 insertions(+), 9465 deletions(-)
rename clang-tools-extra/{pseudo/lib => clangd/support}/Bracket.cpp (97%)
rename clang-tools-extra/{pseudo/include/clang-pseudo => clangd/support}/Bracket.h (87%)
rename clang-tools-extra/{pseudo/lib => clangd/support}/DirectiveTree.cpp (99%)
rename clang-tools-extra/{pseudo/include/clang-pseudo => clangd/support}/DirectiveTree.h (95%)
rename clang-tools-extra/{pseudo/lib => clangd/support}/Lex.cpp (98%)
rename clang-tools-extra/{pseudo/lib => clangd/support}/Token.cpp (98%)
rename clang-tools-extra/{pseudo/include/clang-pseudo => clangd/support}/Token.h (98%)
delete mode 100644 clang-tools-extra/pseudo/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/DesignNotes.md
delete mode 100644 clang-tools-extra/pseudo/Disambiguation.md
delete mode 100644 clang-tools-extra/pseudo/README.md
delete mode 100644 clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
delete mode 100644 clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
delete mode 100644 clang-tools-extra/pseudo/fuzzer/Main.cpp
delete mode 100644 clang-tools-extra/pseudo/gen/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/gen/Main.cpp
delete mode 100644 clang-tools-extra/pseudo/include/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/Language.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
delete mode 100644 clang-tools-extra/pseudo/lib/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/lib/Disambiguate.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/Forest.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/GLR.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/cli/CLI.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/lib/cxx/CXX.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/cxx/cxx.bnf
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
delete mode 100644 clang-tools-extra/pseudo/test/.clang-format
delete mode 100644 clang-tools-extra/pseudo/test/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/test/Unit/lit.cfg.py
delete mode 100644 clang-tools-extra/pseudo/test/Unit/lit.site.cfg.py.in
delete mode 100644 clang-tools-extra/pseudo/test/check-cxx-bnf.test
delete mode 100644 clang-tools-extra/pseudo/test/crash/backslashes.c
delete mode 100644 clang-tools-extra/pseudo/test/cxx/capture-list.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/dangling-else.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/declarator-function.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/declarator-var.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/keyword.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/literals.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/structured-binding.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/unsized-array.cpp
delete mode 100644 clang-tools-extra/pseudo/test/fuzzer.cpp
delete mode 100644 clang-tools-extra/pseudo/test/glr-variant-start.cpp
delete mode 100644 clang-tools-extra/pseudo/test/glr.cpp
delete mode 100644 clang-tools-extra/pseudo/test/html-forest.c
delete mode 100644 clang-tools-extra/pseudo/test/lex.c
delete mode 100644 clang-tools-extra/pseudo/test/lit.cfg.py
delete mode 100644 clang-tools-extra/pseudo/test/lit.local.cfg
delete mode 100644 clang-tools-extra/pseudo/test/lit.site.cfg.py.in
delete mode 100644 clang-tools-extra/pseudo/test/lr-build-basic.test
delete mode 100644 clang-tools-extra/pseudo/test/lr-build-conflicts.test
delete mode 100644 clang-tools-extra/pseudo/test/strip-directives.c
delete mode 100644 clang-tools-extra/pseudo/tool/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/tool/ClangPseudo.cpp
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.cpp
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.css
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.html
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.js
delete mode 100644 clang-tools-extra/pseudo/unittests/BracketTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/unittests/CXXTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/DirectiveTreeTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/ForestTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/GLRTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/GrammarTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/LRTableTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/TokenTest.cpp
diff --git a/clang-tools-extra/CMakeLists.txt b/clang-tools-extra/CMakeLists.txt
index f6a6b57b5ef0bc..6b6f2b1ca22765 100644
--- a/clang-tools-extra/CMakeLists.txt
+++ b/clang-tools-extra/CMakeLists.txt
@@ -27,7 +27,6 @@ add_subdirectory(clang-move)
add_subdirectory(clang-query)
add_subdirectory(include-cleaner)
add_subdirectory(pp-trace)
-add_subdirectory(pseudo)
add_subdirectory(tool-template)
option(CLANG_TOOLS_EXTRA_INCLUDE_DOCS "Generate build targets for the Clang Extra Tools docs."
diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt
index c21d277d2ffcbd..8dcbf5f47e056a 100644
--- a/clang-tools-extra/clangd/CMakeLists.txt
+++ b/clang-tools-extra/clangd/CMakeLists.txt
@@ -183,7 +183,6 @@ target_link_libraries(clangDaemon
${LLVM_PTHREAD_LIB}
clangIncludeCleaner
- clangPseudo
clangTidy
clangTidyUtils
diff --git a/clang-tools-extra/clangd/SemanticSelection.cpp b/clang-tools-extra/clangd/SemanticSelection.cpp
index 3d687173b2be99..dd7116e619e6d0 100644
--- a/clang-tools-extra/clangd/SemanticSelection.cpp
+++ b/clang-tools-extra/clangd/SemanticSelection.cpp
@@ -11,9 +11,6 @@
#include "Protocol.h"
#include "Selection.h"
#include "SourceCode.h"
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/DirectiveTree.h"
-#include "clang-pseudo/Token.h"
#include "clang/AST/DeclBase.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
@@ -25,6 +22,9 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Error.h"
+#include "support/Bracket.h"
+#include "support/DirectiveTree.h"
+#include "support/Token.h"
#include <optional>
#include <queue>
#include <vector>
@@ -181,16 +181,16 @@ llvm::Expected<std::vector<FoldingRange>> getFoldingRanges(ParsedAST &AST) {
// Related issue: https://github.com/clangd/clangd/issues/310
llvm::Expected<std::vector<FoldingRange>>
getFoldingRanges(const std::string &Code, bool LineFoldingOnly) {
- auto OrigStream = pseudo::lex(Code, clang::pseudo::genericLangOpts());
+ auto OrigStream = lex(Code, genericLangOpts());
- auto DirectiveStructure = pseudo::DirectiveTree::parse(OrigStream);
- pseudo::chooseConditionalBranches(DirectiveStructure, OrigStream);
+ auto DirectiveStructure = DirectiveTree::parse(OrigStream);
+ chooseConditionalBranches(DirectiveStructure, OrigStream);
// FIXME: Provide ranges in the disabled-PP regions as well.
auto Preprocessed = DirectiveStructure.stripDirectives(OrigStream);
- auto ParseableStream = cook(Preprocessed, clang::pseudo::genericLangOpts());
- pseudo::pairBrackets(ParseableStream);
+ auto ParseableStream = cook(Preprocessed, genericLangOpts());
+ pairBrackets(ParseableStream);
std::vector<FoldingRange> Result;
auto AddFoldingRange = [&](Position Start, Position End,
@@ -205,19 +205,19 @@ getFoldingRanges(const std::string &Code, bool LineFoldingOnly) {
FR.kind = Kind.str();
Result.push_back(FR);
};
- auto OriginalToken = [&](const pseudo::Token &T) {
+ auto OriginalToken = [&](const Token &T) {
return OrigStream.tokens()[T.OriginalIndex];
};
- auto StartOffset = [&](const pseudo::Token &T) {
+ auto StartOffset = [&](const Token &T) {
return OriginalToken(T).text().data() - Code.data();
};
- auto StartPosition = [&](const pseudo::Token &T) {
+ auto StartPosition = [&](const Token &T) {
return offsetToPosition(Code, StartOffset(T));
};
- auto EndOffset = [&](const pseudo::Token &T) {
+ auto EndOffset = [&](const Token &T) {
return StartOffset(T) + OriginalToken(T).Length;
};
- auto EndPosition = [&](const pseudo::Token &T) {
+ auto EndPosition = [&](const Token &T) {
return offsetToPosition(Code, EndOffset(T));
};
auto Tokens = ParseableStream.tokens();
@@ -235,7 +235,7 @@ getFoldingRanges(const std::string &Code, bool LineFoldingOnly) {
}
}
}
- auto IsBlockComment = [&](const pseudo::Token &T) {
+ auto IsBlockComment = [&](const Token &T) {
assert(T.Kind == tok::comment);
return OriginalToken(T).Length >= 2 &&
Code.substr(StartOffset(T), 2) == "/*";
@@ -246,10 +246,10 @@ getFoldingRanges(const std::string &Code, bool LineFoldingOnly) {
T++;
continue;
}
- pseudo::Token *FirstComment = T;
+ Token *FirstComment = T;
// Show starting sentinals (// and /*) of the comment.
Position Start = offsetToPosition(Code, 2 + StartOffset(*FirstComment));
- pseudo::Token *LastComment = T;
+ Token *LastComment = T;
Position End = EndPosition(*T);
while (T != Tokens.end() && T->Kind == tok::comment &&
StartPosition(*T).line <= End.line + 1) {
diff --git a/clang-tools-extra/pseudo/lib/Bracket.cpp b/clang-tools-extra/clangd/support/Bracket.cpp
similarity index 97%
rename from clang-tools-extra/pseudo/lib/Bracket.cpp
rename to clang-tools-extra/clangd/support/Bracket.cpp
index 07836146ad8a58..93d0f38015efbf 100644
--- a/clang-tools-extra/pseudo/lib/Bracket.cpp
+++ b/clang-tools-extra/clangd/support/Bracket.cpp
@@ -62,10 +62,10 @@
//
//===----------------------------------------------------------------------===//
-#include "clang-pseudo/Bracket.h"
+#include "Bracket.h"
namespace clang {
-namespace pseudo {
+namespace clangd {
namespace {
struct Bracket {
@@ -83,7 +83,7 @@ struct Bracket {
// Find brackets in the stream and convert to Bracket struct.
std::vector<Bracket> findBrackets(const TokenStream &Stream) {
std::vector<Bracket> Brackets;
- auto Add = [&](const pseudo::Token &Tok, Bracket::BracketKind K,
+ auto Add = [&](const Token &Tok, Bracket::BracketKind K,
Bracket::Direction D) {
Brackets.push_back(
{K, D, Tok.Line, Tok.Indent, Stream.index(Tok), Bracket::None});
@@ -151,5 +151,5 @@ void pairBrackets(TokenStream &Stream) {
applyPairings(Brackets, Stream);
}
-} // namespace pseudo
+} // namespace clangd
} // namespace clang
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h b/clang-tools-extra/clangd/support/Bracket.h
similarity index 87%
rename from clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
rename to clang-tools-extra/clangd/support/Bracket.h
index 268cfff1ab07ab..b43c22cea06d06 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
+++ b/clang-tools-extra/clangd/support/Bracket.h
@@ -23,19 +23,19 @@
//
//===----------------------------------------------------------------------===//
-#ifndef CLANG_PSEUDO_BRACKET_H
-#define CLANG_PSEUDO_BRACKET_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_BRACKET_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_BRACKET_H
-#include "clang-pseudo/Token.h"
+#include "Token.h"
namespace clang {
-namespace pseudo {
+namespace clangd {
/// Identifies bracket token in the stream which should be paired.
/// Sets Token::Pair accordingly.
void pairBrackets(TokenStream &);
-} // namespace pseudo
+} // namespace clangd
} // namespace clang
-#endif
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_BRACKET_H
diff --git a/clang-tools-extra/clangd/support/CMakeLists.txt b/clang-tools-extra/clangd/support/CMakeLists.txt
index 0c8c199dd4a4c2..a1a2a7765fae95 100644
--- a/clang-tools-extra/clangd/support/CMakeLists.txt
+++ b/clang-tools-extra/clangd/support/CMakeLists.txt
@@ -16,9 +16,12 @@ if(NOT HAVE_CXX_ATOMICS_WITHOUT_LIB OR NOT HAVE_CXX_ATOMICS64_WITHOUT_LIB)
endif()
add_clang_library(clangdSupport
+ Bracket.cpp
Cancellation.cpp
Context.cpp
+ DirectiveTree.cpp
FileCache.cpp
+ Lex.cpp
Logger.cpp
Markup.cpp
MemoryTree.cpp
@@ -27,6 +30,7 @@ add_clang_library(clangdSupport
ThreadCrashReporter.cpp
Threading.cpp
ThreadsafeFS.cpp
+ Token.cpp
Trace.cpp
LINK_LIBS
diff --git a/clang-tools-extra/pseudo/lib/DirectiveTree.cpp b/clang-tools-extra/clangd/support/DirectiveTree.cpp
similarity index 99%
rename from clang-tools-extra/pseudo/lib/DirectiveTree.cpp
rename to clang-tools-extra/clangd/support/DirectiveTree.cpp
index 9e853e46edc232..d25da111681afc 100644
--- a/clang-tools-extra/pseudo/lib/DirectiveTree.cpp
+++ b/clang-tools-extra/clangd/support/DirectiveTree.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "clang-pseudo/DirectiveTree.h"
+#include "DirectiveTree.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/Support/FormatVariadic.h"
@@ -14,7 +14,7 @@
#include <variant>
namespace clang {
-namespace pseudo {
+namespace clangd {
namespace {
class DirectiveParser {
@@ -353,5 +353,5 @@ TokenStream DirectiveTree::stripDirectives(const TokenStream &In) const {
return Out;
}
-} // namespace pseudo
+} // namespace clangd
} // namespace clang
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveTree.h b/clang-tools-extra/clangd/support/DirectiveTree.h
similarity index 95%
rename from clang-tools-extra/pseudo/include/clang-pseudo/DirectiveTree.h
rename to clang-tools-extra/clangd/support/DirectiveTree.h
index 2b6cb63297915d..34f5a888863f26 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/DirectiveTree.h
+++ b/clang-tools-extra/clangd/support/DirectiveTree.h
@@ -25,17 +25,17 @@
//
//===----------------------------------------------------------------------===//
-#ifndef CLANG_PSEUDO_DIRECTIVETREE_H
-#define CLANG_PSEUDO_DIRECTIVETREE_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DIRECTIVETREE_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DIRECTIVETREE_H
-#include "clang-pseudo/Token.h"
+#include "Token.h"
#include "clang/Basic/TokenKinds.h"
#include <optional>
#include <variant>
#include <vector>
namespace clang {
-namespace pseudo {
+namespace clangd {
/// Describes the structure of a source file, as seen by the preprocessor.
///
@@ -124,7 +124,7 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &,
/// The choices are stored in Conditional::Taken nodes.
void chooseConditionalBranches(DirectiveTree &, const TokenStream &Code);
-} // namespace pseudo
+} // namespace clangd
} // namespace clang
-#endif // CLANG_PSEUDO_DIRECTIVETREE_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_DIRECTIVETREE_H
diff --git a/clang-tools-extra/pseudo/lib/Lex.cpp b/clang-tools-extra/clangd/support/Lex.cpp
similarity index 98%
rename from clang-tools-extra/pseudo/lib/Lex.cpp
rename to clang-tools-extra/clangd/support/Lex.cpp
index 2111476f04dc5b..f043b551b6bc6c 100644
--- a/clang-tools-extra/pseudo/lib/Lex.cpp
+++ b/clang-tools-extra/clangd/support/Lex.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "clang-pseudo/Token.h"
+#include "Token.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/TokenKinds.h"
@@ -14,7 +14,7 @@
#include "clang/Lex/LiteralSupport.h"
namespace clang {
-namespace pseudo {
+namespace clangd {
TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
clang::SourceLocation Start;
@@ -135,5 +135,5 @@ TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
return Result;
}
-} // namespace pseudo
+} // namespace clangd
} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/Token.cpp b/clang-tools-extra/clangd/support/Token.cpp
similarity index 98%
rename from clang-tools-extra/pseudo/lib/Token.cpp
rename to clang-tools-extra/clangd/support/Token.cpp
index 5b07a62f37fb2f..13eee66606061d 100644
--- a/clang-tools-extra/pseudo/lib/Token.cpp
+++ b/clang-tools-extra/clangd/support/Token.cpp
@@ -6,14 +6,14 @@
//
//===----------------------------------------------------------------------===//
-#include "clang-pseudo/Token.h"
+#include "Token.h"
#include "clang/Basic/LangOptions.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormatVariadic.h"
namespace clang {
-namespace pseudo {
+namespace clangd {
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
OS << llvm::formatv("{0} {1}:{2} ", clang::tok::getTokenName(T.Kind), T.Line,
@@ -126,5 +126,5 @@ TokenStream stripComments(const TokenStream &Input) {
return Out;
}
-} // namespace pseudo
+} // namespace clangd
} // namespace clang
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/clangd/support/Token.h
similarity index 98%
rename from clang-tools-extra/pseudo/include/clang-pseudo/Token.h
rename to clang-tools-extra/clangd/support/Token.h
index 859fd7d2b3dfe2..555b6b0e4ce570 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ b/clang-tools-extra/clangd/support/Token.h
@@ -25,8 +25,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef CLANG_PSEUDO_TOKEN_H
-#define CLANG_PSEUDO_TOKEN_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangStandard.h"
@@ -41,7 +41,7 @@
namespace clang {
class LangOptions;
-namespace pseudo {
+namespace clangd {
/// A single C++ or preprocessor token.
///
@@ -249,7 +249,7 @@ TokenStream cook(const TokenStream &, const clang::LangOptions &);
/// Drops comment tokens.
TokenStream stripComments(const TokenStream &);
-} // namespace pseudo
+} // namespace clangd
} // namespace clang
-#endif // CLANG_PSEUDO_TOKEN_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_TOKEN_H
diff --git a/clang-tools-extra/pseudo/CMakeLists.txt b/clang-tools-extra/pseudo/CMakeLists.txt
deleted file mode 100644
index 24bc1530bb7d6f..00000000000000
--- a/clang-tools-extra/pseudo/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-include_directories(include)
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
-add_subdirectory(include)
-add_subdirectory(gen)
-add_subdirectory(lib)
-add_subdirectory(tool)
-add_subdirectory(fuzzer)
-add_subdirectory(benchmarks)
-if(CLANG_INCLUDE_TESTS)
- add_subdirectory(unittests)
- add_subdirectory(test)
-endif()
diff --git a/clang-tools-extra/pseudo/DesignNotes.md b/clang-tools-extra/pseudo/DesignNotes.md
deleted file mode 100644
index 421cc02aef7576..00000000000000
--- a/clang-tools-extra/pseudo/DesignNotes.md
+++ /dev/null
@@ -1,123 +0,0 @@
-# Error recovery (2022-05-07)
-
-Problem: we have two fairly generic cases of recovery bounded within a range:
- - sequences: `int x; this is absolute garbage; x++;`
- - brackets: `void foo() { this is garbage too }`
-
-So far, we've thought of these as different, and had precise ideas about
-brackets ("lazy parsing") and vague ones about sequences.
-Con we unify them instead?
-
-In both cases we want to recognize the bounds of the garbage item based on
-basic token-level features of the surrounding code, and avoid any interference
-with the surrounding code.
-
-## Brackets
-
-Consider a rule like `compound-stmt := { stmt-seq }`.
-
-The desired recovery is:
-- if we fail at `{ . stmt-seq }`
-- ... and we can find for the matching `}`
-- then consume up to that token as an opaque broken `stmt-seq`
-- ... and advance state to `{ stmt-seq . }`
-
-We can annotate the rule to describe this: `{ stmt-seq [recovery] }`.
-We can generalize as `{ stmt-seq [recovery=rbrace] }`, allowing for different
-**strategies** to find the resume point.
-
-(It's OK to assume we're skipping over one RHS nonterminal, we can always
-introduce a nonterminal for the bracket contents if necessary).
-
-## Sequences
-
-Can we apply the same technique to sequences?
-Simplest case: delimited right-recursive sequence.
-
-```
-param-list := param
-param-list := param , param-list
-```
-
-We need recovery in **both** rules.
-`param` in the first rule matches the *last* param in a list,
-in the second rule it matches all others. We want to recover in any position.
-
-If we want to be able to recovery `int x int y` as two parameters, then we
-should extract a `param-and-comma` rule that recovery could operate on.
-
-### Last vs not-last elements
-
-Sequences will usually have two rules with recovery, we discussed:
- - how to pick the correct one to recover with
- - in a left-recursive rule they correspond to last & not-last elements
- - the recovery strategy knows whether we're recoverying last or not-last
- - we could have the strategy return (pos, symbol parsed), and artificially
- require distinct symbols (e.g. `stmt` vs `last_stmt`).
- - we can rewrite left-recursion in the grammar as right-recursion
-
-However, on reflection I think we can simply allow recovery according to both
-rules. The "wrong" recovery will produce a parse head that dies.
-
-## How recovery fits into GLR
-
-Recovery should kick in at the point where we would otherwise abandon all
-variants of an high-level parse.
-
-e.g. Suppose we're parsing `static_cast<foo bar baz>(1)` and are up to `bar`.
-Our GSS looks something like:
-
-```
- "the static cast's type starts at foo"
----> {expr := static_cast < . type > ( expr )}
- | "foo... is a class name"
- +---- {type := identifier .}
- | "foo... is a template ID"
- +---- {type := identifier . < template-arg-list >}
-```
-
-Since `foo bar baz` isn't a valid class name or template ID, both active heads
-will soon die, as will the parent GSS Node - the latter should trigger recovery.
-
-- we need a refcount in GSS nodes so we can recognize never-reduced node death
-- when a node dies, we look up its recovery actions (symbol, strategy).
- These are the union of the recovery actions for each item.
- They can be stored in the action table.
- Here: `actions[State, death] = Recovery(type, matching-angle-bracket)`
-- we try each strategy: feeding in the start position = token of the dying node
- (`foo`) getting out the end position (`>`).
-- We form an opaque forest node with the correct symbol (`type`) spanning
- [start, end)
-- We create a GSS node to represent the state after recovery.
- The new state is found in the Goto table in the usual way.
-
-```
- "the static cast's type starts at foo"
----> {expr := static_cast < . type > ( expr )}
- | "`foo bar baz` is an unparseable type"
- +---- {expr := static_cast < type . > (expr)}
-```
-
-## Which recovery heads to activate
-
-We probably shouldn't *always* create active recovery heads when a recoverable
-node dies (and thus keep it alive).
-By design GLR creates multiple speculative parse heads and lets incorrect heads
-disappear.
-
-Concretely, the expression `(int *)(x)` is a valid cast, we probably shouldn't
-also parse it as a call whose callee is a broken expr.
-
-The simplest solution is to only create recovery heads if there are no normal
-heads remaining, i.e. if parsing is completely stuck. This is vulnerable if the
-"wrong" parse makes slightly more progress than the "right" parse which has
-better error recovery.
-
-A sophisticated variant might record recovery opportunities and pick the one
-with the rightmost *endpoint* when the last parse head dies.
-
-We should consider whether including every recovery in the parse forest might
-work after all - this would let disambiguation choose "broken" but likely parses
-over "valid" but unlikely ones.
-
-
diff --git a/clang-tools-extra/pseudo/Disambiguation.md b/clang-tools-extra/pseudo/Disambiguation.md
deleted file mode 100644
index 39e246a523beb9..00000000000000
--- a/clang-tools-extra/pseudo/Disambiguation.md
+++ /dev/null
@@ -1,367 +0,0 @@
-# Disambiguation
-
-The C++ grammar is highly ambiguous, so the GLR parser produces a forest of
-parses, represented compactly by a DAG.
-A real C++ parser finds the correct parse through semantic analysis: mostly
-resolving names. But we can't do that, as we don't parse the headers.
-
-Our disambiguation phase should take the parse forest, and choose a single parse
-tree that is most likely.
-It might **optionally** use some other hints (e.g. coding style, or what
-specific names tend to mean in this codebase).
-
-There are some grammatical ambiguities that can be resolved without semantic
-analysis, e.g. whether `int <declarator>{}` is a function-definition.
-We eliminate these earlier e.g., with rule guards. By "disambiguation" we mean
-choosing between interpretations that we can't reject confidently and locally.
-
-## Types of evidence
-
-We have limited information to go on, and strive to use similar heuristics a
-human reader might.
-
-### Likely and unlikely structure
-
-In some cases, the shape of a particular interpretation is unlikely but not
-impossible. For example, the statement `x(a);` might:
-
-- call a function `x` (likely)
-- construct a temporary of class type `x` (less likely)
-- define a variable `a` of type `x`, which is an alias for e.g. `int`
- (unlikely!)
-
-We model this as a bonus/penalty for including a particular forest node in the
-chosen parse. For each rule we want to apply, we can write some code to
-recognize the corresponding pattern in the parse tree, and run these recognizers
-at each node to assign the bonuses.
-
-### Interpreting names
-
-Just as resolving names allows a C++ parser to choose the right parse (rejecting
-others), chunks of a parse tree imply things about how names resolve.
-
-Because the same name often means the same thing in different contexts, we can
-apply knowledge from elsewhere. This can be as simple as knowing "`vector` is
-usually a type", and adding bonuses to nodes that include that interpretation.
-
-However we can also transfer knowledge across the same file we're parsing:
-
-```cpp
-// Is `Builder` a class or a namespace?
-void Builder::build() { ... }
-// ...
-// `Builder` is a type.
-Builder b;
-```
-
-We can use this to understand more-ambiguous code based on names in a section
-we're more sure about. It also pushes us to provide a consistent parse, rather
-than interpreting each occurrence of an unclear name differently.
-
-Again, we can define bonuses/penalties for forest nodes that interpret names,
-but this time those bonuses change as we disambiguate. Specifically:
-
-- we can group identifiers into **classes**, most importantly "all identifiers
- with text 'foo'" but also "all snake_case identifiers".
-- clusters of nodes immediately above the identifiers in the parse forest are
- **interpretations**, they bind the identifier to a **kind** such as "type",
- "value", "namespace", "other".
-- for each class we can query information once from an external source (such as
- an index or hard-coded list), yielding a vector of weights (one per kind)
-- at each point we can compute metrics based on interpretations in the forest:
- - the number of identifiers in the class that are interpreted as each kind
- (e.g. *all* remaining interpretations of 'foo' at 3:7 are 'type')
- - the number of identifiers in the class that *may* be interpreted as each
- kind (e.g. 'foo' at 3:7 might be a 'type').
-- we can mash these metrics together into a vector of bonuses for a class (e.g.
- for identifiers with text 'bar', 'type'=>+5, 'namespace'=>+1, 'value'=>-2).
-- these bonuses are assigned to corresponding interpretations in the graph
-
-### Templates
-
-Another aspect of a name is whether it names a template (type or value). This
-is ambiguous in many more cases since CTAD allowed template arguments to be
-omitted.
-
-A fairly simple heuristic appears sufficient here: things that look like
-templates usually are, so if a node for certain rules exists in the forest
-(e.g. `template-id := template-name < template-argument-list >`) then we treat
-the template name as a probable template, and apply a bonus to every node that
-interprets it that way. We do this even if alternate parses are possible
-(`a < b > :: c` might be a comparison, but is still evidence `a` is a template).
-
-## Algorithm sketch
-
-With static node scores, finding the best tree is a very tractable problem
-with an efficient solution.
-With dynamic scores it becomes murky and we have to settle for approximations.
-These build on the same idea, so we'll look at the simple version first.
-
-### Naive version (static scores)
-
-At a high level, we want to assign bonuses to nodes, and find the tree that
-maximizes the total score. If bonuses were fixed, independent of other
-disambiguation decisions, then we could simply walk bottom-up, aggregating
-scores and replacing each ambiguous node with the top-scoring alternative
-subtree. This could happen directly on the parse tree.
-
-Given this tree as input:
-
-```mermaid
-flowchart TB
- subgraph
- idA["a"]
- open["("]
- idB["b"]
- close[")"]
- semi[";"]
- end
- class idA,open,idB,close,semi token;
-
- typeA["type := IDENTIFIER"] --- idA
- exprA["expr := IDENTIFIER"] --- idA
- exprB["expr := IDENTIFIER"] --- idB
- declB["declarator := IDENTIFIER"] --- idB
- stmtExpr --- semi
- stmtDecl --- semi
-
- stmtAmbig["stmt?"]:::ambig
- stmtAmbig === stmtExpr["stmt := expr ;"]
- stmtExpr --- exprAmbig["expr?"]:::ambig
- exprAmbig === funcall["expr := expr ( expr )"]:::good
- funcall --- exprA
- funcall --- open
- funcall --- exprB["expr := IDENTIFIER"]
- funcall --- close
- exprAmbig -.- construct["expr := type ( expr )"]:::bad
- construct --- typeA
- construct --- open
- construct --- exprB
- construct --- close
- stmtAmbig -.- stmtDecl["stmt := decl"]
- stmtDecl --- decl["decl := type declarator ;"]
- decl --- typeA
- decl --- declParens["declarator := ( declarator )"]:::bad
- declParens --- open
- declParens --- declB
- declParens --- close
-
- classDef ambig fill:blue,color:white;
- classDef token fill:yellow;
- classDef good fill:lightgreen
- classDef bad fill:pink
-```
-
-A post-order traversal reaches the ambiguous node `expr?` first.
-The left alternative has a total score of +1 (green bonus for
-`expr := expr (expr)`) and the right alternative has a total bonus of -1
-(red penalty for `expr := type (expr)`). So we replace `expr?` with its left
-alternative.
-
-As we continue traversing, we reach `stmt?` next: again we have +1 in the left
-subtree and -1 in the right subtree, so we pick the left one. Result:
-
-```mermaid
-flowchart TB
- subgraph
- idA["a"]
- open["("]
- idB["b"]
- close[")"]
- semi[";"]
- end
- class idA,open,idB,close,semi token;
-
- typeA["type := IDENTIFIER"] --- idA
- exprA["expr := IDENTIFIER"] --- idA
- exprB["expr := IDENTIFIER"] --- idB
- declB["declarator := IDENTIFIER"] --- idB
- stmtExpr --- semi
- stmtDecl --- semi
-
- stmtExpr["stmt := expr ;"]
- stmtExpr --- funcall["expr := expr ( expr )"]
- funcall --- exprA
- funcall --- open
- funcall --- exprB["expr := IDENTIFIER"]
- funcall --- close
-
- classDef token fill:yellow;
-```
-
-### Degrees of freedom
-
-We must traverse the DAG bottom-up in order to make score-based decisions:
-if an ambiguous node has ambiguous descendants then we can't calculate the score
-for that subtree.
-
-This gives us a topological **partial** order, but we don't have to go from
-left-to-right. At any given point there is a "frontier" of ambiguous nodes with
-no ambiguous descendants. The sequence we choose matters: each choice adds
-more interpretations that should affect future choices.
-
-Initially, most of the ambiguous nodes in the frontier will be e.g. "is this
-identifier a type or a value". If we had to work left-to-right then we'd
-immediately be forced to resolve the first name in the file, likely with
-little to go on and high chance of a mistake.
-But there are probably names where we have strong evidence, e.g. we've seen an
-(unambiguous) declaration of a variable `foo`, so other occurrences of `foo`
-are very likely to be values rather than types. We can disambiguate these with
-high confidence, and these choices are likely to "unlock" other conclusions that
-we can use for further disambiguation.
-
-This is intuitively similar to how people decipher ambiguous code: they find a
-piece that's easy to understand, read that to learn what names mean, and use
-the knowledge gained to study the more difficult parts.
-
-To formalize this a little:
-- we prioritize making the **highest confidence** decisions first
-- we define **confidence** as the score of the accepted alternative, minus the
- score of the best rejected alternative.
-
-### Removing the bottom-up restriction
-
-Strictly only resolving "frontier" ambiguities may cause problems.
-Consider the following example:
-
-```mermaid
-flowchart TB
- subgraph
- a:::token
- b:::token
- end
-
- ambig1:::ambig
- ambig1 --- choice1:::good
- ambig1 --- choice2
- choice1 --- ambig2:::ambig
- ambig2 --- aType["a is class"] --- a
- ambig2 --- aTemplate["a is CTAD"] --- a
- choice1 --- bValue["b is variable"] --- b
-
- classDef ambig fill:blue,color:white;
- classDef token fill:yellow;
- classDef good fill:lightgreen
- classDef bad fill:pink
-```
-
-We have some evidence that `choice1` is good. If we selected it, we would know
-that `b` is a variable and could use this in disambiguating the rest of the
-file. However we can't select `choice1` until we decide exactly how to interpret
-`a`, and there might be little info to do so. Gating higher-confidence decisions
-on lower-confidence ones increases our chance of making an error.
-
-A possible fix would be to generalize to a **range** of possible scores for
-nodes above the frontier, and rank by **minimum confidence**, i.e. the highest
-min-score of the accepted alternative, minus the highest max-score among the
-rejected alternatives.
-
-## Details
-
-The remaining challenges are mainly:
-- defining the score function for an alternative. This is TBD, pending
- experiments.
-- finding a data structure and algorithm to efficiently resolve/re-evaluate
- in a loop until we've resolved all ambiguities.
-
-### Disambiguation DAG
-
-Rather than operate on the forest directly, it's simpler to consider a reduced
-view that hides complexity unrelated to disambiguation:
-
-**Forest:**
-
-```mermaid
-flowchart TB
- subgraph
- open["{"]
- a
- star["*"]
- b
- semi[";"]
- close["}"]
- end
- class open,a,star,b,semi,close token
-
- compound-stmt --- open
- compound-stmt --- stmt?
- compound-stmt --- close
-
- stmt?:::ambig --- decl-stmt
- decl-stmt --- type-name
- type-name --a is type--- a
- decl-stmt --- declarator
- declarator --- star
- declarator --- declarator_b["declarator"]
- declarator_b --b is value--- b
- decl-stmt --- semi
-
- stmt?:::ambig --- expr-stmt
- expr-stmt --- expr1["expr"]
- expr-stmt --- star
- expr-stmt --- expr_b["expr"]
- expr-stmt --- semi
- expr_a --a is value--- a
- expr_b --b is value--- b
-
- classDef ambig fill:blue,color:white;
- classDef token fill:yellow;
-```
-
-**Ambiguity graph:**
-
-```mermaid
-flowchart TB
- subgraph
- a
- b
- end
- class a,b token
-
- root --- stmt?
-
- stmt?:::ambig --- decl-stmt
- decl-stmt --a is type--- a
- decl-stmt --b is value--- b
-
- stmt?:::ambig --- expr-stmt
- expr-stmt --a is value--- a
- expr-stmt --b is value--- b
-
- classDef ambig fill:blue,color:white;
- classDef token fill:yellow;
-```
-
-Here the clusters of non-ambiguous forest nodes are grouped together, so that the DAG is bipartite with ambiguous/cluster nodes, and interpretation edges at the bottom.
-
-Scoring the clusters and selecting which to include is equivalent to disambiguating the full graph.
-
-### Resolving the ambiguity DAG
-
-The static scores of the forest nodes are aggregated into static scores for the clusters.
-The interpretation edges of the frontier clusters can be scored based on the context available so far, and the scores "bubble up" to parent nodes, with ambiguous nodes creating score ranges as described above.
-
-The main dynamic signal is when a token has been fully resolved, which happens when all the interpretations leading to it have the same label.
-
-The naive algorithm is to score all clusters, choose the best to resolve and repeat.
-However this is very slow:
- - there are **many** ambiguities available at first, therefore many clusters to score
- - each time we resolve an ambiguity, we invalidate previously computed scores
- - while the clusters become fewer over time, there are more interpretations per cluster
-
-It's tempting to use a priority queue to avoid repeatedly scanning clusters. However if we invalidate a large fraction of a heap's elements each round, we lose the efficiency benefits it brings.
-We could reuse scores if the resolved cluster doesn't tell us much about the target cluster.
-The simplest idea is to only recalculate clusters with an overlapping word; this may not save much (consider `std`) as clusters get larger.
-A heuristic to estimate how much a cluster affects another may help.
-
-To stop the clusters having too many interpretation edges (and thus take too long to score), we can drop the edges for any token that is fully resolved. We need to track these anyway (for scoring of interpretations of other identifiers with the same text). And once only a single interpretation exists, removing it has no impact on scores.
-
-So for now the sketch is:
-- build the ambiguity DAG
-- compute scores for all clusters
-- place confidences (score difference) for each cluster in a priority queue
-- while there is still ambiguity:
- - take the most confident cluster C and resolve it
- - propagate the score change to all of C's ancestors
- - work out which identifiers are now resolved, record that and remove the interpretations from the graph
- - recompute scores for the K clusters most affected by resolving those identifiers, and their ancestors
diff --git a/clang-tools-extra/pseudo/README.md b/clang-tools-extra/pseudo/README.md
deleted file mode 100644
index 0958f5d500e7f3..00000000000000
--- a/clang-tools-extra/pseudo/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# clang pseudoparser
-
-This directory implements an approximate heuristic parser for C++, based on the
-clang lexer, the C++ grammar, and the GLR parsing algorithm.
-
-It parses a file in isolation, without reading its included headers.
-The result is a strict syntactic tree whose structure follows the C++ grammar.
-There is no semantic analysis, apart from guesses to disambiguate the parse.
-Disambiguation can optionally be guided by an AST or a symbol index.
-
-For now, the best reference on intended scope is the [design proposal],
-with further discussion on the [RFC].
-
-## Dependencies between pseudoparser and clang
-
-Dependencies are limited because they don't make sense, but also to avoid
-placing a burden on clang maintainers.
-
-The pseudoparser reuses the clang lexer (clangLex and clangBasic libraries) but
-not the higher-level libraries (Parse, Sema, AST, Frontend...).
-
-When the pseudoparser should be used together with an AST (e.g. to guide
-disambiguation), this is a separate "bridge" library that depends on both.
-
-Clang does not depend on the pseudoparser at all. If this seems useful in future
-it should be discussed by RFC.
-
-## Parity between pseudoparser and clang
-
-The pseudoparser aims to understand real-world code, and particularly the
-languages and extensions supported by Clang.
-
-However we don't try to keep these in lockstep: there's no expectation that
-Clang parser changes are accompanied by pseudoparser changes or vice versa.
-
-[design proposal]: https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
-[RFC]: https://discourse.llvm.org/t/rfc-a-c-pseudo-parser-for-tooling/59217/49
diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
deleted file mode 100644
index 087ab6c250e39e..00000000000000
--- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===--- Benchmark.cpp - clang pseudoparser benchmarks ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Benchmark for the overall pseudoparser performance, it also includes other
-// important pieces of the pseudoparser (grammar compilation, LR table build
-// etc).
-//
-// Note: make sure to build the benchmark in Release mode.
-//
-// Usage:
-// tools/clang/tools/extra/pseudo/benchmarks/ClangPseudoBenchmark \
-// --grammar=../clang-tools-extra/pseudo/lib/cxx.bnf \
-// --source=../clang/lib/Sema/SemaDecl.cpp
-//
-//===----------------------------------------------------------------------===//
-
-#include "benchmark/benchmark.h"
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/DirectiveTree.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <string>
-
-using llvm::cl::desc;
-using llvm::cl::opt;
-using llvm::cl::Required;
-
-static opt<std::string> Source("source", desc("Source file"), Required);
-
-namespace clang {
-namespace pseudo {
-namespace bench {
-namespace {
-
-const std::string *SourceText = nullptr;
-const Language *Lang = nullptr;
-
-void setup() {
- auto ReadFile = [](llvm::StringRef FilePath) -> std::string {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GrammarText =
- llvm::MemoryBuffer::getFile(FilePath);
- if (std::error_code EC = GrammarText.getError()) {
- llvm::errs() << "Error: can't read file '" << FilePath
- << "': " << EC.message() << "\n";
- std::exit(1);
- }
- return GrammarText.get()->getBuffer().str();
- };
- SourceText = new std::string(ReadFile(Source));
- Lang = &getLanguageFromFlags();
-}
-
-static void buildSLR(benchmark::State &State) {
- for (auto _ : State)
- LRTable::buildSLR(Lang->G);
-}
-BENCHMARK(buildSLR);
-
-TokenStream lexAndPreprocess() {
- clang::LangOptions LangOpts = genericLangOpts();
- TokenStream RawStream = pseudo::lex(*SourceText, LangOpts);
- auto DirectiveStructure = DirectiveTree::parse(RawStream);
- chooseConditionalBranches(DirectiveStructure, RawStream);
- TokenStream Cook =
- cook(DirectiveStructure.stripDirectives(RawStream), LangOpts);
- auto Stream = stripComments(Cook);
- pairBrackets(Stream);
- return Stream;
-}
-
-static void lex(benchmark::State &State) {
- clang::LangOptions LangOpts = genericLangOpts();
- for (auto _ : State)
- clang::pseudo::lex(*SourceText, LangOpts);
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(lex);
-
-static void pairBrackets(benchmark::State &State) {
- clang::LangOptions LangOpts = genericLangOpts();
- auto Stream = clang::pseudo::lex(*SourceText, LangOpts);
- for (auto _ : State)
- pairBrackets(Stream);
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(pairBrackets);
-
-static void preprocess(benchmark::State &State) {
- clang::LangOptions LangOpts = genericLangOpts();
- TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts);
- for (auto _ : State) {
- auto DirectiveStructure = DirectiveTree::parse(RawStream);
- chooseConditionalBranches(DirectiveStructure, RawStream);
- stripComments(
- cook(DirectiveStructure.stripDirectives(RawStream), LangOpts));
- }
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(preprocess);
-
-static void glrParse(benchmark::State &State) {
- SymbolID StartSymbol = *Lang->G.findNonterminal("translation-unit");
- TokenStream Stream = lexAndPreprocess();
- for (auto _ : State) {
- pseudo::ForestArena Forest;
- pseudo::GSS GSS;
- pseudo::glrParse(ParseParams{Stream, Forest, GSS}, StartSymbol, *Lang);
- }
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(glrParse);
-
-static void full(benchmark::State &State) {
- SymbolID StartSymbol = *Lang->G.findNonterminal("translation-unit");
- for (auto _ : State) {
- TokenStream Stream = lexAndPreprocess();
- pseudo::ForestArena Forest;
- pseudo::GSS GSS;
- pseudo::glrParse(ParseParams{Stream, Forest, GSS}, StartSymbol, *Lang);
- }
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(full);
-
-} // namespace
-} // namespace bench
-} // namespace pseudo
-} // namespace clang
-
-int main(int argc, char *argv[]) {
- benchmark::Initialize(&argc, argv);
- llvm::cl::ParseCommandLineOptions(argc, argv);
- clang::pseudo::bench::setup();
- benchmark::RunSpecifiedBenchmarks();
- return 0;
-}
diff --git a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt b/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
deleted file mode 100644
index 859db991403cd5..00000000000000
--- a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-add_benchmark(ClangPseudoBenchmark Benchmark.cpp)
-
-target_link_libraries(ClangPseudoBenchmark
- PRIVATE
- clangPseudo
- clangPseudoCLI
- clangPseudoGrammar
- LLVMSupport
- )
diff --git a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt b/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
deleted file mode 100644
index e1d79873471f07..00000000000000
--- a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- FuzzerCLI
- Support
- )
-
-add_llvm_fuzzer(clang-pseudo-fuzzer
- Fuzzer.cpp
- DUMMY_MAIN Main.cpp
- )
-
-target_link_libraries(clang-pseudo-fuzzer
- PRIVATE
- clangPseudo
- clangPseudoCLI
- clangPseudoGrammar
- )
diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
deleted file mode 100644
index 87b9d15480cc35..00000000000000
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//===-- Fuzzer.cpp - Fuzz the pseudoparser --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/DirectiveTree.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-class Fuzzer {
- clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
- bool Print;
-
-public:
- Fuzzer(bool Print) : Print(Print) {}
-
- void operator()(llvm::StringRef Code) {
- std::string CodeStr = Code.str(); // Must be null-terminated.
- auto RawStream = lex(CodeStr, LangOpts);
- auto DirectiveStructure = DirectiveTree::parse(RawStream);
- clang::pseudo::chooseConditionalBranches(DirectiveStructure, RawStream);
- // FIXME: strip preprocessor directives
- auto ParseableStream =
- clang::pseudo::stripComments(cook(RawStream, LangOpts));
-
- clang::pseudo::ForestArena Arena;
- clang::pseudo::GSS GSS;
- const Language &Lang = getLanguageFromFlags();
- auto &Root =
- glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
- *Lang.G.findNonterminal("translation-unit"), Lang);
- if (Print)
- llvm::outs() << Root.dumpRecursive(Lang.G);
- }
-};
-
-Fuzzer *Fuzz = nullptr;
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
-
-extern "C" {
-
-// Set up the fuzzer from command line flags:
-// -print - used for testing the fuzzer
-int LLVMFuzzerInitialize(int *Argc, char ***Argv) {
- bool PrintForest = false;
- auto ConsumeArg = [&](llvm::StringRef Arg) -> bool {
- if (Arg == "-print") {
- PrintForest = true;
- return true;
- }
- return false;
- };
- *Argc = std::remove_if(*Argv + 1, *Argv + *Argc, ConsumeArg) - *Argv;
-
- clang::pseudo::Fuzz = new clang::pseudo::Fuzzer(PrintForest);
- return 0;
-}
-
-int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) {
- (*clang::pseudo::Fuzz)(llvm::StringRef(reinterpret_cast<char *>(Data), Size));
- return 0;
-}
-}
diff --git a/clang-tools-extra/pseudo/fuzzer/Main.cpp b/clang-tools-extra/pseudo/fuzzer/Main.cpp
deleted file mode 100644
index 542a3007a399f1..00000000000000
--- a/clang-tools-extra/pseudo/fuzzer/Main.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===--- Main.cpp - Entry point to sanity check the fuzzer ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/FuzzMutate/FuzzerCLI.h"
-
-extern "C" int LLVMFuzzerInitialize(int *, char ***);
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t *, size_t);
-int main(int argc, char *argv[]) {
- return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput,
- LLVMFuzzerInitialize);
-}
diff --git a/clang-tools-extra/pseudo/gen/CMakeLists.txt b/clang-tools-extra/pseudo/gen/CMakeLists.txt
deleted file mode 100644
index 3dd615a5587512..00000000000000
--- a/clang-tools-extra/pseudo/gen/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-set(LLVM_LINK_COMPONENTS Support)
-list(REMOVE_ITEM LLVM_COMMON_DEPENDS clang-tablegen-targets)
-
-add_clang_executable(clang-pseudo-gen
- Main.cpp
- )
-
-target_link_libraries(clang-pseudo-gen
- PRIVATE
- clangPseudoGrammar
- )
diff --git a/clang-tools-extra/pseudo/gen/Main.cpp b/clang-tools-extra/pseudo/gen/Main.cpp
deleted file mode 100644
index 25cb26563837a6..00000000000000
--- a/clang-tools-extra/pseudo/gen/Main.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//===--- Main.cpp - Compile BNF grammar -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a tool to compile a BNF grammar, it is used by the build system to
-// generate the necessary data bits to statically construct core pieces (Grammar,
-// LRTable etc) of the LR parser.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include <algorithm>
-
-using llvm::cl::desc;
-using llvm::cl::init;
-using llvm::cl::opt;
-using llvm::cl::Required;
-using llvm::cl::value_desc;
-using llvm::cl::values;
-
-namespace {
-enum EmitType {
- EmitSymbolList,
- EmitGrammarContent,
-};
-
-opt<std::string> Grammar("grammar", desc("Parse a BNF grammar file."),
- Required);
-opt<EmitType>
- Emit(desc("which information to emit:"),
- values(clEnumValN(EmitSymbolList, "emit-symbol-list",
- "Print nonterminal symbols (default)"),
- clEnumValN(EmitGrammarContent, "emit-grammar-content",
- "Print the BNF grammar content as a string")));
-
-opt<std::string> OutputFilename("o", init("-"), desc("Output"),
- value_desc("file"));
-
-std::string readOrDie(llvm::StringRef Path) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
- llvm::MemoryBuffer::getFile(Path);
- if (std::error_code EC = Text.getError()) {
- llvm::errs() << "Error: can't read grammar file '" << Path
- << "': " << EC.message() << "\n";
- ::exit(1);
- }
- return Text.get()->getBuffer().str();
-}
-} // namespace
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-// Mangles a symbol name into a valid identifier.
-//
-// These follow names in the grammar fairly closely:
-// nonterminal: `ptr-declarator` becomes `ptr_declarator`;
-// punctuator: `,` becomes `COMMA`;
-// keyword: `INT` becomes `INT`;
-// terminal: `IDENTIFIER` becomes `IDENTIFIER`;
-std::string mangleSymbol(SymbolID SID, const Grammar &G) {
- static auto &TokNames = *new std::vector<std::string>{
-#define TOK(X) llvm::StringRef(#X).upper(),
-#define KEYWORD(Keyword, Condition) llvm::StringRef(#Keyword).upper(),
-#include "clang/Basic/TokenKinds.def"
- };
- if (isToken(SID))
- return TokNames[symbolToToken(SID)];
- std::string Name = G.symbolName(SID).str();
- // translation-unit -> translation_unit
- std::replace(Name.begin(), Name.end(), '-', '_');
- return Name;
-}
-
-// Mangles the RHS of a rule definition into a valid identifier.
-//
-// These are unique only for a fixed LHS.
-// e.g. for the grammar rule `ptr-declarator := ptr-operator ptr-declarator`,
-// it is `ptr_operator__ptr_declarator`.
-std::string mangleRule(RuleID RID, const Grammar &G) {
- const auto &R = G.lookupRule(RID);
- std::string MangleName = mangleSymbol(R.seq().front(), G);
- for (SymbolID S : R.seq().drop_front()) {
- MangleName.append("__");
- MangleName.append(mangleSymbol(S, G));
- }
- return MangleName;
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
-
-int main(int argc, char *argv[]) {
- llvm::cl::ParseCommandLineOptions(argc, argv, "");
-
- std::string GrammarText = readOrDie(Grammar);
- std::vector<std::string> Diags;
- auto G = clang::pseudo::Grammar::parseBNF(GrammarText, Diags);
-
- if (!Diags.empty()) {
- llvm::errs() << llvm::join(Diags, "\n");
- return 1;
- }
-
- std::error_code EC;
- llvm::ToolOutputFile Out{OutputFilename, EC, llvm::sys::fs::OF_None};
- if (EC) {
- llvm::errs() << EC.message() << '\n';
- return 1;
- }
-
- switch (Emit) {
- case EmitSymbolList:
- Out.os() << R"cpp(
-#ifndef NONTERMINAL
-#define NONTERMINAL(NAME, ID)
-#endif
-#ifndef RULE
-#define RULE(LHS, RHS, ID)
-#endif
-#ifndef EXTENSION
-#define EXTENSION(NAME, ID)
-#endif
-)cpp";
- for (clang::pseudo::SymbolID ID = 0; ID < G.table().Nonterminals.size();
- ++ID) {
- Out.os() << llvm::formatv("NONTERMINAL({0}, {1})\n",
- clang::pseudo::mangleSymbol(ID, G), ID);
- for (const clang::pseudo::Rule &R : G.rulesFor(ID)) {
- clang::pseudo::RuleID RID = &R - G.table().Rules.data();
- Out.os() << llvm::formatv("RULE({0}, {1}, {2})\n",
- clang::pseudo::mangleSymbol(R.Target, G),
- clang::pseudo::mangleRule(RID, G), RID);
- }
- }
- for (clang::pseudo::ExtensionID EID = 1 /*skip the sentinel 0 value*/;
- EID < G.table().AttributeValues.size(); ++EID) {
- llvm::StringRef Name = G.table().AttributeValues[EID];
- assert(!Name.empty());
- Out.os() << llvm::formatv("EXTENSION({0}, {1})\n", Name, EID);
- }
- Out.os() << R"cpp(
-#undef NONTERMINAL
-#undef RULE
-#undef EXTENSION
-)cpp";
- break;
- case EmitGrammarContent:
- for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) {
- Out.os() << '"';
- Out.os().write_escaped((Line + "\n").str());
- Out.os() << "\"\n";
- }
- break;
- }
-
- Out.keep();
-
- return 0;
-}
diff --git a/clang-tools-extra/pseudo/include/CMakeLists.txt b/clang-tools-extra/pseudo/include/CMakeLists.txt
deleted file mode 100644
index 619b00f34a5caa..00000000000000
--- a/clang-tools-extra/pseudo/include/CMakeLists.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-# The cxx.bnf grammar file
-set(cxx_bnf ${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxx/cxx.bnf)
-
-setup_host_tool(clang-pseudo-gen CLANG_PSEUDO_GEN pseudo_gen pseudo_gen_target)
-
-# Generate inc files.
-set(cxx_symbols_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXSymbols.inc)
-add_custom_command(OUTPUT ${cxx_symbols_inc}
- COMMAND "${pseudo_gen}"
- --grammar ${cxx_bnf}
- --emit-symbol-list
- -o ${cxx_symbols_inc}
- COMMENT "Generating nonterminal symbol file for cxx grammar..."
- DEPENDS ${pseudo_gen_target} ${cxx_bnf}
- VERBATIM)
-
-set(cxx_bnf_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXBNF.inc)
-add_custom_command(OUTPUT ${cxx_bnf_inc}
- COMMAND "${pseudo_gen}"
- --grammar ${cxx_bnf}
- --emit-grammar-content
- -o ${cxx_bnf_inc}
- COMMENT "Generating bnf string file for cxx grammar..."
- DEPENDS ${pseudo_gen_target} ${cxx_bnf}
- VERBATIM)
-
-# add_custom_command does not create a new target, we need to deine a target
-# explicitly, so that other targets can depend on it.
-add_custom_target(cxx_gen
- DEPENDS ${cxx_symbols_inc} ${cxx_bnf_inc}
- VERBATIM)
-set_target_properties(cxx_gen PROPERTIES FOLDER "Clang Tools Extra/Sourcegenning")
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h b/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
deleted file mode 100644
index 5f3a22c9cabb37..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
+++ /dev/null
@@ -1,64 +0,0 @@
-//===--- Disambiguate.h - Find the best tree in the forest -------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A GLR parse forest represents every possible parse tree for the source code.
-//
-// Before we can do useful analysis/editing of the code, we need to pick a
-// single tree which we think is accurate. We use three main types of clues:
-//
-// A) Semantic language rules may restrict which parses are allowed.
-// For example, `string string string X` is *grammatical* C++, but only a
-// single type-name is allowed in a decl-specifier-sequence.
-// Where possible, these interpretations are forbidden by guards.
-// Sometimes this isn't possible, or we want our parser to be lenient.
-//
-// B) Some constructs are rarer, while others are common.
-// For example `a<b>::c` is often a template specialization, and rarely a
-// double comparison between a, b, and c.
-//
-// C) Identifier text hints whether they name types/values/templates etc.
-// "std" is usually a namespace, a project index may also guide us.
-// Hints may be within the document: if one occurrence of 'foo' is a variable
-// then the others probably are too.
-// (Text need not match: similar CaseStyle can be a weak hint, too).
-//
-//----------------------------------------------------------------------------//
-//
-// Mechanically, we replace each ambiguous node with its best alternative.
-//
-// "Best" is determined by assigning bonuses/penalties to nodes, to express
-// the clues of type A and B above. A forest node representing an unlikely
-// parse would apply a penalty to every subtree is is present in.
-// Disambiguation proceeds bottom-up, so that the score of each alternative
-// is known when a decision is made.
-//
-// Identifier-based hints within the document mean some nodes should be
-// *correlated*. Rather than resolve these simultaneously, we make the most
-// certain decisions first and use these results to update bonuses elsewhere.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Forest.h"
-
-namespace clang::pseudo {
-
-struct DisambiguateParams {};
-
-// Maps ambiguous nodes onto the index of their preferred alternative.
-using Disambiguation = llvm::DenseMap<const ForestNode *, unsigned>;
-
-// Resolve each ambiguous node in the forest.
-// Maps each ambiguous node to the index of the chosen alternative.
-// FIXME: current implementation is a placeholder and chooses arbitrarily.
-Disambiguation disambiguate(const ForestNode *Root,
- const DisambiguateParams &Params);
-
-// Remove all ambiguities from the forest, resolving them according to Disambig.
-void removeAmbiguities(ForestNode *&Root, const Disambiguation &Disambig);
-
-} // namespace clang::pseudo
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
deleted file mode 100644
index e9edb40e02b64e..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ /dev/null
@@ -1,236 +0,0 @@
-//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A parse forest represents a set of possible parse trees efficiently, it is
-// produced by the GLR parser.
-//
-// Despite the name, its data structure is a tree-like DAG with a single root.
-// Multiple ways to parse the same tokens are presented as an ambiguous node
-// with all possible interpretations as children.
-// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
-// "expr := expr + expr", they will share a Sequence node representing the expr.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_FOREST_H
-#define CLANG_PSEUDO_FOREST_H
-
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Allocator.h"
-#include <cstdint>
-
-namespace clang {
-namespace pseudo {
-
-// A node represents ways to parse a sequence of tokens, it interprets a fixed
-// range of tokens as a fixed grammar symbol.
-//
-// There are different kinds of nodes, some nodes have "children" (stored in a
-// trailing array) and have pointers to them. "Children" has different semantics
-// depending on the node kinds. For an Ambiguous node, it means all
-// possible interpretations; for a Sequence node, it means each symbol on the
-// right hand side of the production rule.
-//
-// Since this is a node in a DAG, a node may have multiple parents. And a node
-// doesn't have parent pointers.
-class alignas(class ForestNode *) ForestNode {
-public:
- class RecursiveIterator;
- enum Kind {
- // A Terminal node is a single terminal symbol bound to a token.
- Terminal,
- // A Sequence node is a nonterminal symbol parsed from a grammar rule,
- // elements() are the parses of each symbol on the RHS of the rule.
- // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
- // are [X, Y, Z].
- Sequence,
- // An Ambiguous node exposes multiple ways to interpret the code as the
- // same symbol, alternatives() are all possible parses.
- Ambiguous,
- // An Opaque node is a placeholder. It asserts that tokens match a symbol,
- // without saying how.
- // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
- // code).
- Opaque,
- };
- Kind kind() const { return K; }
-
- SymbolID symbol() const { return Symbol; }
-
- // The start of the token range, it is a poistion within a token stream.
- Token::Index startTokenIndex() const { return StartIndex; }
-
- // Returns the corresponding grammar rule.
- // REQUIRES: this is a Sequence node.
- RuleID rule() const {
- assert(kind() == Sequence);
- return Data & ((1 << RuleBits) - 1);
- }
- // Returns the parses of each element on the RHS of the rule.
- // REQUIRES: this is a Sequence node;
- llvm::ArrayRef<const ForestNode *> elements() const {
- assert(kind() == Sequence);
- return children(Data >> RuleBits);
- }
- llvm::MutableArrayRef<ForestNode *> elements() {
- assert(kind() == Sequence);
- return children(Data >> RuleBits);
- }
-
- // Returns all possible interpretations of the code.
- // REQUIRES: this is an Ambiguous node.
- llvm::ArrayRef<const ForestNode *> alternatives() const {
- assert(kind() == Ambiguous);
- return children(Data);
- }
- llvm::MutableArrayRef<ForestNode *> alternatives() {
- assert(kind() == Ambiguous);
- return children(Data);
- }
-
- llvm::ArrayRef<const ForestNode *> children() const {
- switch (kind()) {
- case Sequence:
- return elements();
- case Ambiguous:
- return alternatives();
- case Terminal:
- case Opaque:
- return {};
- }
- llvm_unreachable("Bad kind");
- }
-
- // Iteration over all nodes in the forest, including this.
- llvm::iterator_range<RecursiveIterator> descendants() const;
-
- std::string dump(const Grammar &) const;
- std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
-
-private:
- friend class ForestArena;
-
- ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
- : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}
-
- ForestNode(const ForestNode &) = delete;
- ForestNode &operator=(const ForestNode &) = delete;
- ForestNode(ForestNode &&) = delete;
- ForestNode &operator=(ForestNode &&) = delete;
-
- static uint16_t sequenceData(RuleID Rule,
- llvm::ArrayRef<const ForestNode *> Elements) {
- assert(Rule < (1 << RuleBits));
- assert(Elements.size() < (1 << (16 - RuleBits)));
- return Rule | Elements.size() << RuleBits;
- }
- static uint16_t
- ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
- return Alternatives.size();
- }
-
- // Retrieves the trailing array.
- llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
- return llvm::ArrayRef(reinterpret_cast<ForestNode *const *>(this + 1), Num);
- }
- llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) {
- return llvm::MutableArrayRef(reinterpret_cast<ForestNode **>(this + 1),
- Num);
- }
-
- Token::Index StartIndex;
- Kind K : 4;
- SymbolID Symbol : SymbolBits;
- // Sequence - child count : 4 | RuleID : RuleBits (12)
- // Ambiguous - child count : 16
- // Terminal, Opaque - unused
- uint16_t Data;
- // An array of ForestNode* following the object.
-};
-// ForestNode may not be destroyed (for BumpPtrAllocator).
-static_assert(std::is_trivially_destructible<ForestNode>());
-
-// A memory arena for the parse forest.
-class ForestArena {
-public:
- llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
- ForestNode &createSequence(SymbolID SID, RuleID RID,
- llvm::ArrayRef<const ForestNode *> Elements) {
- assert(!Elements.empty());
- return create(ForestNode::Sequence, SID,
- Elements.front()->startTokenIndex(),
- ForestNode::sequenceData(RID, Elements), Elements);
- }
- ForestNode &createAmbiguous(SymbolID SID,
- llvm::ArrayRef<const ForestNode *> Alternatives) {
- assert(!Alternatives.empty());
- assert(llvm::all_of(Alternatives,
- [SID](const ForestNode *Alternative) {
- return SID == Alternative->symbol();
- }) &&
- "Ambiguous alternatives must represent the same symbol!");
- return create(ForestNode::Ambiguous, SID,
- Alternatives.front()->startTokenIndex(),
- ForestNode::ambiguousData(Alternatives), Alternatives);
- }
- ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
- return create(ForestNode::Opaque, SID, Start, 0, {});
- }
-
- ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
- return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
- }
-
- size_t nodeCount() const { return NodeCount; }
- size_t bytes() const { return Arena.getBytesAllocated() + sizeof(*this); }
-
-private:
- ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
- uint16_t Data,
- llvm::ArrayRef<const ForestNode *> Elements) {
- ++NodeCount;
- ForestNode *New = new (Arena.Allocate(
- sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
- alignof(ForestNode))) ForestNode(K, SID, Start, Data);
- if (!Elements.empty())
- llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
- return *New;
- }
-
- llvm::BumpPtrAllocator Arena;
- uint32_t NodeCount = 0;
-};
-
-class ForestNode::RecursiveIterator
- : public llvm::iterator_facade_base<ForestNode::RecursiveIterator,
- std::input_iterator_tag,
- const ForestNode> {
- llvm::DenseSet<const ForestNode *> Seen;
- struct StackFrame {
- const ForestNode *Parent;
- unsigned ChildIndex;
- };
- std::vector<StackFrame> Stack;
- const ForestNode *Cur;
-
-public:
- RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {}
-
- const ForestNode &operator*() const { return *Cur; }
- void operator++();
- bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; }
- bool operator!=(const RecursiveIterator &I) const { return !(*this == I); }
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_FOREST_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
deleted file mode 100644
index 0100f818d4ed78..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ /dev/null
@@ -1,170 +0,0 @@
-//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This implements a standard Generalized LR (GLR) parsing algorithm.
-//
-// The GLR parser behaves as a normal LR parser until it encounters a conflict.
-// To handle a conflict (where there are multiple actions could perform), the
-// parser will simulate nondeterminism by doing a breadth-first search
-// over all the possibilities.
-//
-// Basic mechanisims of the GLR parser:
-// - A number of processes are operated in parallel.
-// - Each process has its own parsing stack and behaves as a standard
-// determinism LR parser.
-// - When a process encounters a conflict, it will be fork (one for each
-// avaiable action).
-// - When a process encounters an error, it is abandoned.
-// - All process are synchronized by the lookahead token: they perfrom shift
-// action at the same time, which means some processes need wait until other
-// processes have performed all reduce actions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GLR_H
-#define CLANG_PSEUDO_GLR_H
-
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "llvm/Support/Allocator.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-
-// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
-// parser.
-//
-// Each node stores a parse state, the last parsed ForestNode, and the parent
-// node. There may be several heads (top of stack), and the parser operates by:
-// - shift: pushing terminal symbols on top of the stack
-// - reduce: replace N symbols on top of the stack with one nonterminal
-//
-// The structure is a DAG rather than a linear stack:
-// - GLR allows multiple actions (conflicts) on the same head, producing forks
-// where several nodes have the same parent
-// - The parser merges nodes with the same (state, ForestNode), producing joins
-// where one node has multiple parents
-//
-// The parser is responsible for creating nodes and keeping track of the set of
-// heads. The GSS class is mostly an arena for them.
-struct GSS {
- // A node represents a partial parse of the input up to some point.
- //
- // It is the equivalent of a frame in an LR parse stack.
- // Like such a frame, it has an LR parse state and a syntax-tree node
- // representing the last parsed symbol (a ForestNode in our case).
- // Unlike a regular LR stack frame, it may have multiple parents.
- //
- // Nodes are not exactly pushed and popped on the stack: pushing is just
- // allocating a new head node with a parent pointer to the old head. Popping
- // is just forgetting about a node and remembering its parent instead.
- struct alignas(struct Node *) Node {
- // LR state describing how parsing should continue from this head.
- LRTable::StateID State;
- // Used internally to track reachability during garbage collection.
- bool GCParity;
- // Have we already used this node for error recovery? (prevents loops)
- mutable bool Recovered = false;
- // Number of the parents of this node.
- // The parents hold previous parsed symbols, and may resume control after
- // this node is reduced.
- unsigned ParentCount;
- // The parse node for the last parsed symbol.
- // This symbol appears on the left of the dot in the parse state's items.
- // (In the literature, the node is attached to the *edge* to the parent).
- const ForestNode *Payload = nullptr;
-
- llvm::ArrayRef<const Node *> parents() const {
- return llvm::ArrayRef(reinterpret_cast<const Node *const *>(this + 1),
- ParentCount);
- };
- // Parents are stored as a trailing array of Node*.
- };
-
- // Allocates a new node in the graph.
- const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
- llvm::ArrayRef<const Node *> Parents);
- // Frees all nodes not reachable as ancestors of Roots, and returns the count.
- // Calling this periodically prevents steady memory growth of the GSS.
- unsigned gc(std::vector<const Node *> &&Roots);
-
- size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
- size_t nodesCreated() const { return NodesCreated; }
-
-private:
- // Nodes are recycled using freelists.
- // They are variable size, so use one free-list per distinct #parents.
- std::vector<std::vector<Node *>> FreeList;
- Node *allocate(unsigned Parents);
- void destroy(Node *N);
- // The list of nodes created and not destroyed - our candidates for gc().
- std::vector<Node *> Alive;
- bool GCParity = false; // All nodes should match this, except during GC.
-
- llvm::BumpPtrAllocator Arena;
- unsigned NodesCreated = 0;
-};
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);
-
-// Parameters for the GLR parsing.
-struct ParseParams {
- // The token stream to parse.
- const TokenStream &Code;
-
- // Arena for data structure used by the GLR algorithm.
- ForestArena &Forest; // Storage for the output forest.
- GSS &GSStack; // Storage for parsing stacks.
-};
-
-// Parses the given token stream as the start symbol with the GLR algorithm,
-// and returns a forest node of the start symbol.
-//
-// A rule `_ := StartSymbol` must exit for the chosen start symbol.
-//
-// If the parsing fails, we model it as an opaque node in the forest.
-ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
- const Language &Lang);
-
-// Shift a token onto all OldHeads, placing the results into NewHeads.
-//
-// Exposed for testing only.
-void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
- const ForestNode &NextTok, const ParseParams &Params,
- const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
-// Applies available reductions on Heads, appending resulting heads to the list.
-//
-// Exposed for testing only.
-void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
- const ParseParams &Params, const Language &Lang);
-
-// Heuristically recover from a state where no further parsing is possible.
-//
-// OldHeads is the parse state at TokenIndex.
-// This function consumes zero or more tokens by advancing TokenIndex,
-// and places any recovery states created in NewHeads.
-//
-// On failure, NewHeads is empty and TokenIndex is unchanged.
-//
-// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
-// there is a risk of the parser falling into an infinite loop, creating an
-// endless sequence of recovery nodes.
-// Generally it is safe for recovery to match 0 tokens against sequence symbols
-// like `statement-seq`, as the grammar won't permit another statement-seq
-// immediately afterwards. However recovery strategies for `statement` should
-// consume at least one token, as statements may be adjacent in the input.
-void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
- unsigned &TokenIndex, const ParseParams &Params,
- const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_GLR_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h
deleted file mode 100644
index 1a2b71f081da0a..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h
+++ /dev/null
@@ -1,64 +0,0 @@
-//===--- Language.h -------------------------------------------- -*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_LANGUAGE_H
-#define CLANG_PSEUDO_LANGUAGE_H
-
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-
-namespace clang {
-namespace pseudo {
-class ForestNode;
-class TokenStream;
-class LRTable;
-
-struct GuardParams {
- llvm::ArrayRef<const ForestNode *> RHS;
- const TokenStream &Tokens;
- // FIXME: use the index of Tokens.
- SymbolID Lookahead;
-};
-// A guard restricts when a grammar rule can be used.
-//
-// The GLR parser will use the guard to determine whether a rule reduction will
-// be conducted. For example, e.g. a guard may allow the rule
-// `virt-specifier := IDENTIFIER` only if the identifier's text is 'override`.
-//
-// Return true if the guard is satisfied.
-using RuleGuard = llvm::function_ref<bool(const GuardParams &)>;
-
-// A recovery strategy determines a region of code to skip when parsing fails.
-//
-// For example, given `class-def := CLASS IDENT { body [recover=Brackets] }`,
-// if parsing fails while attempting to parse `body`, we may skip up to the
-// matching `}` and assume everything between was a `body`.
-//
-// The provided index is the token where the skipped region begins.
-// Returns the (excluded) end of the range, or Token::Invalid for no recovery.
-using RecoveryStrategy =
- llvm::function_ref<Token::Index(Token::Index Start, const TokenStream &)>;
-
-// Specify a language that can be parsed by the pseduoparser.
-struct Language {
- Grammar G;
- LRTable Table;
-
- // Binding extension ids to corresponding implementations.
- llvm::DenseMap<RuleID, RuleGuard> Guards;
- llvm::DenseMap<ExtensionID, RecoveryStrategy> RecoveryStrategies;
-
- // FIXME: add clang::LangOptions.
- // FIXME: add default start symbols.
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_LANGUAGE_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h b/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
deleted file mode 100644
index db09aba21502fd..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//===--- CLI.h - Get grammar from variant sources ----------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Provides the Grammar, LRTable etc for a language specified by the `--grammar`
-// flags. It is by design to be used by pseudoparser-based CLI tools.
-//
-// The CLI library defines a `--grammar` CLI flag, which supports 1) using a
-// grammar from a file (--grammar=/path/to/lang.bnf) or using the prebuilt cxx
-// language (--grammar=cxx).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_CLI_CLI_H
-#define CLANG_PSEUDO_CLI_CLI_H
-
-#include "clang-pseudo/Language.h"
-
-namespace clang {
-namespace pseudo {
-
-// Returns the corresponding Language from the '--grammar' command-line flag.
-//
-// !! If the grammar flag is invalid (e.g. unexisting file), this function will
-// exit the program immediately.
-const Language &getLanguageFromFlags();
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_CLI_CLI_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h b/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
deleted file mode 100644
index 7bbb4d2c00201f..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//===--- CXX.h - Public interfaces for the C++ grammar -----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines public interfaces for the C++ grammar
-// (pseudo/lib/cxx/cxx.bnf). It provides a fast way to access core building
-// pieces of the LR parser, e.g. Grammar, LRTable, rather than parsing the
-// grammar file at the runtime.
-//
-// We do a compilation of the C++ BNF grammar at build time, and generate
-// critical data sources. The implementation of the interfaces are based on the
-// generated data sources.
-//
-// FIXME: not everything is fully compiled yet. The implementation of the
-// interfaces are still parsing the grammar file at the runtime.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_CXX_CXX_H
-#define CLANG_PSEUDO_CXX_CXX_H
-
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-
-namespace clang {
-namespace pseudo {
-namespace cxx {
-
-// We want enums to be scoped but implicitly convertible to RuleID etc.
-// So create regular (unscoped) enums inside subnamespaces of `detail`.
-// Then add aliases for them outside `detail`.
-namespace detail {
-namespace symbols {
-enum Symbol : SymbolID {
-#define NONTERMINAL(X, Y) X = Y,
-#include "CXXSymbols.inc"
-#undef NONTERMINAL
-};
-} // namespace symbols
-
-namespace extensions {
-enum Extension : ExtensionID {
-#define EXTENSION(X, Y) X = Y,
-#include "CXXSymbols.inc"
-#undef EXTENSION
-};
-} // namespace extensions
-
-namespace rules {
-// For each symbol we close the last symbol's enum+namespace and open new ones.
-// We need a dummy namespace+enum so that this works for the first rule.
-namespace dummy {
-enum Dummy {
-//clang-format off
-#define NONTERMINAL(NAME, ID) \
-}; \
-} \
-namespace NAME { \
-enum Rule : RuleID {
-//clang-format on
-#define RULE(LHS, RHS, ID) RHS = ID,
-#include "CXXSymbols.inc"
-};
-}
-} // namespace rules
-} // namespace detail
-
-// Symbol represents nonterminal symbols in the C++ grammar.
-// It provides a simple uniform way to access a particular nonterminal.
-using Symbol = detail::symbols::Symbol;
-
-using Extension = detail::extensions::Extension;
-
-namespace rule {
-#define NONTERMINAL(NAME, ID) using NAME = detail::rules::NAME::Rule;
-#include "CXXSymbols.inc"
-} // namespace rule
-
-// Returns the Language for the cxx.bnf grammar.
-const Language &getLanguage();
-
-} // namespace cxx
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_CXX_CXX_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
deleted file mode 100644
index a1c779a02d8640..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ /dev/null
@@ -1,230 +0,0 @@
-//===--- Grammar.h - grammar used by clang pseudoparser ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines base structures for parsing & modeling a grammar for a
-// programming language:
-//
-// # This is a fake C++ BNF grammar
-// _ := translation-unit
-// translation-unit := declaration-seq_opt
-// declaration-seq := declaration
-// declaration-seq := declaration-seq declaration
-//
-// A grammar formally describes a language, and it is constructed by a set of
-// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
-// nonterminal or terminal, identified by a SymbolID.
-//
-// Annotations are supported in a syntax form of [key=value]. They specify
-// attributes which are associated with either a grammar symbol (on the
-// right-hand side of the symbol) or a grammar rule (at the end of the rule
-// body).
-// Attributes provide a way to inject custom code into the GLR parser. Each
-// unique attribute value creates an extension point (identified by ExtensionID
-// ), and an extension point corresponds to a piece of native code. For
-// example, C++ grammar has a rule:
-//
-// compound_statement := { statement-seq [recover=Brackets] }
-//
-// The `recover` attribute instructs the parser that we should perform error
-// recovery if parsing the statement-seq fails. The `Brackets` recovery
-// heuristic is implemented in CXX.cpp by binding the ExtensionID for the
-// `Recovery` value to a specific C++ function that finds the recovery point.
-//
-// Notions about the BNF grammar:
-// - "_" is the start symbol of the augmented grammar;
-// - single-line comment is supported, starting with a #
-// - A rule describes how a nonterminal (left side of :=) is constructed, and
-// it is *per line* in the grammar file
-// - Terminals (also called tokens) correspond to the clang::TokenKind; they
-// are written in the grammar like "IDENTIFIER", "USING", "+"
-// - Nonterminals are specified with "lower-case" names in the grammar; they
-// shouldn't be nullable (has an empty sequence)
-// - optional symbols are supported (specified with a _opt suffix), and they
-// will be eliminated during the grammar parsing stage
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
-#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
-
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
-#include <optional>
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
-// nonterminal IDs are indexes into a table of nonterminal symbols.
-// Terminal IDs correspond to the clang TokenKind enum.
-using SymbolID = uint16_t;
-// SymbolID is only 12 bits wide.
-// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
-static constexpr uint16_t SymbolBits = 12;
-static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
-// SymbolIDs with the top bit set are tokens/terminals.
-static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
-inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
-inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
-// The terminals are always the clang tok::TokenKind (not all are used).
-inline tok::TokenKind symbolToToken(SymbolID SID) {
- assert(isToken(SID));
- SID &= ~TokenFlag;
- assert(SID < NumTerminals);
- return static_cast<tok::TokenKind>(SID);
-}
-inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
- return TokenFlag | static_cast<SymbolID>(TK);
-}
-
-// An extension is a piece of native code specific to a grammar that modifies
-// the behavior of annotated rules. One ExtensionID is assigned for each unique
-// attribute value (all attributes share a namespace).
-using ExtensionID = uint16_t;
-
-// A RuleID uniquely identifies a production rule in a grammar.
-// It is an index into a table of rules.
-using RuleID = uint16_t;
-// There are maximum 2^12 rules.
-static constexpr unsigned RuleBits = 12;
-
-// Represent a production rule in the grammar, e.g.
-// expression := a b c
-// ^Target ^Sequence
-struct Rule {
- Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
-
- // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
- // long, however, we're stricter in order to reduce the size, we limit the max
- // length to 9 (this is the longest sequence in cxx grammar).
- static constexpr unsigned SizeBits = 4;
- static constexpr unsigned MaxElements = 9;
- static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
- static_assert(SizeBits + SymbolBits <= 16,
- "Must be able to store symbol ID + size efficiently");
-
- // 16 bits for target symbol and size of sequence:
- // SymbolID : 12 | Size : 4
- SymbolID Target : SymbolBits;
- uint8_t Size : SizeBits; // Size of the Sequence
- SymbolID Sequence[MaxElements];
-
- // A guarded rule has extra logic to determine whether the RHS is eligible.
- bool Guarded = false;
-
- // Specifies the index within Sequence eligible for error recovery.
- // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
- // should recover by finding the matching brace, and forcing stmt-seq to match
- // everything between braces.
- // For now, only a single strategy at a single point is possible.
- uint8_t RecoveryIndex = -1;
- ExtensionID Recovery = 0;
-
- llvm::ArrayRef<SymbolID> seq() const {
- return llvm::ArrayRef<SymbolID>(Sequence, Size);
- }
- friend bool operator==(const Rule &L, const Rule &R) {
- return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
- }
-};
-
-struct GrammarTable;
-
-// Grammar that describes a programming language, e.g. C++. It represents the
-// contents of the specified grammar.
-// It is a building block for constructing a table-based parser.
-class Grammar {
-public:
- Grammar() = default; // Creates an invalid dummy grammar.
- explicit Grammar(std::unique_ptr<GrammarTable>);
-
- // Parses grammar from a BNF file.
- // Diagnostics emitted during parsing are stored in Diags.
- static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags);
-
- // Returns the SymbolID of the symbol '_'.
- SymbolID underscore() const { return Underscore; };
-
- // Returns all rules of the given nonterminal symbol.
- llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
- const Rule &lookupRule(RuleID RID) const;
-
- // Gets symbol (terminal or nonterminal) name.
- // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
- llvm::StringRef symbolName(SymbolID) const;
-
- // Lookup the SymbolID of the nonterminal symbol by Name.
- std::optional<SymbolID> findNonterminal(llvm::StringRef Name) const;
-
- // Dumps the whole grammar.
- std::string dump() const;
- // Dumps a particular rule.
- std::string dumpRule(RuleID) const;
- // Dumps all rules of the given nonterminal symbol.
- std::string dumpRules(SymbolID) const;
-
- const GrammarTable &table() const { return *T; }
-
-private:
- std::unique_ptr<GrammarTable> T;
- // The symbol ID of '_'. (In the LR literature, this is the start symbol of
- // the augmented grammar.)
- SymbolID Underscore;
-};
-// For each nonterminal X, computes the set of terminals that begin strings
-// derived from X. (Known as FIRST sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
-// For each nonterminal X, computes the set of terminals that could immediately
-// follow X. (Known as FOLLOW sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
-
-// Storage for the underlying data of the Grammar.
-// It can be constructed dynamically (from compiling BNF file) or statically
-// (a compiled data-source).
-struct GrammarTable {
- GrammarTable();
-
- struct Nonterminal {
- std::string Name;
- // Corresponding rules that construct the nonterminal, it is a [Start, End)
- // index range of the Rules table.
- struct {
- RuleID Start;
- RuleID End;
- } RuleRange;
- };
-
- // RuleID is an index into this table of rule definitions.
- //
- // Rules with the same target symbol (LHS) are grouped into a single range.
- // The relative order of different target symbols is *not* by SymbolID, but
- // rather a topological sort: if S := T then the rules producing T have lower
- // RuleIDs than rules producing S.
- // (This strange order simplifies the GLR parser: for a given token range, if
- // we reduce in increasing RuleID order then we need never backtrack --
- // prerequisite reductions are reached before dependent ones).
- std::vector<Rule> Rules;
- // A table of terminals (aka tokens). It corresponds to the clang::Token.
- // clang::tok::TokenKind is the index of the table.
- llvm::ArrayRef<std::string> Terminals;
- // A table of nonterminals, sorted by name.
- // SymbolID is the index of the table.
- std::vector<Nonterminal> Nonterminals;
- // A table of attribute values, sorted by name.
- // ExtensionID is the index of the table.
- std::vector<std::string> AttributeValues;
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
deleted file mode 100644
index dd9e87c2c172bf..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
+++ /dev/null
@@ -1,196 +0,0 @@
-//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// LR parsers are bottom-up parsers -- they scan the input from left to right,
-// and collect the right-hand side of a production rule (called handle) on top
-// of the stack, then replace (reduce) the handle with the nonterminal defined
-// by the production rule.
-//
-// This file defines LRGraph, a deterministic handle-finding finite-state
-// automaton, which is a key component in LR parsers to recognize any of
-// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
-// Table) based on the LRGraph.
-//
-// LRGraph can be constructed for any context-free grammars.
-// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
-// interpretation of the FSA is nondeterministic -- we might in a state where
-// we can continue searching an handle and identify a handle (called
-// shift/reduce conflicts), or identify more than one handle (callled
-// reduce/reduce conflicts).
-//
-// LRGraph is a common model for all variants of LR automatons, from the most
-// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
-// in making decisions.
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
-#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/Hashing.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-
-// An LR item -- a grammar rule with a dot at some position of the body.
-// e.g. a production rule A := X Y yields 3 items:
-// A := . X Y
-// A := X . Y
-// A := X Y .
-// An item indicates how much of a production rule has been recognized at a
-// position (described by dot), for example, A := X . Y indicates that we have
-// recognized the X part from the input, and we hope next to see the input
-// derivable from Y.
-class Item {
-public:
- static Item start(RuleID ID, const Grammar &G) {
- Item I;
- I.RID = ID;
- I.RuleLength = G.lookupRule(ID).Size;
- return I;
- }
- static Item sentinel(RuleID ID) {
- Item I;
- I.RID = ID;
- return I;
- }
-
- RuleID rule() const { return RID; }
- uint8_t dot() const { return DotPos; }
-
- bool hasNext() const { return DotPos < RuleLength; }
- SymbolID next(const Grammar &G) const {
- assert(hasNext());
- return G.lookupRule(RID).Sequence[DotPos];
- }
-
- Item advance() const {
- assert(hasNext());
- Item I = *this;
- ++I.DotPos;
- return I;
- }
-
- std::string dump(const Grammar &G) const;
-
- bool operator==(const Item &I) const {
- return DotPos == I.DotPos && RID == I.RID;
- }
- bool operator<(const Item &I) const {
- return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
- }
- friend llvm::hash_code hash_value(const Item &I) {
- return llvm::hash_combine(I.RID, I.DotPos);
- }
-
-private:
- RuleID RID = 0;
- uint8_t DotPos = 0;
- uint8_t RuleLength = 0; // the length of rule body.
-};
-
-// A state represents a node in the LR automaton graph. It is an item set, which
-// contains all possible rules that the LR parser may be parsing in that state.
-//
-// Conceptually, If we knew in advance what we're parsing, at any point we're
-// partway through parsing a production, sitting on a stack of partially parsed
-// productions. But because we don't know, there could be *several* productions
-// we're partway through. The set of possibilities is the parser state, and we
-// precompute all the transitions between these states.
-struct State {
- // A full set of items (including non-kernel items) representing the state,
- // in a canonical order (see SortByNextSymbol in the cpp file).
- std::vector<Item> Items;
-
- std::string dump(const Grammar &G, unsigned Indent = 0) const;
-};
-
-// LRGraph is a deterministic finite state automaton for LR parsing.
-//
-// Intuitively, an LR automaton is a transition graph. The graph has a
-// collection of nodes, called States. Each state corresponds to a particular
-// item set, which represents a condition that could occur during the process of
-// parsing a production. Edges are directed from one state to another. Each edge
-// is labeled by a grammar symbol (terminal or nonterminal).
-//
-// LRGraph is used to construct the LR parsing table which is a core
-// data-structure driving the LR parser.
-class LRGraph {
-public:
- // StateID is the index in States table.
- using StateID = uint16_t;
-
- // Constructs an LR(0) automaton.
- static LRGraph buildLR0(const Grammar &);
-
- // An edge in the LR graph, it represents a transition in the LR automaton.
- // If the parser is at state Src, with a lookahead Label, then it
- // transits to state Dst.
- struct Edge {
- StateID Src, Dst;
- SymbolID Label;
- };
-
- // A possible error recovery: choose to match some tokens against a symbol.
- //
- // e.g. a state that contains
- // stmt := { . stmt-seq [recover=braces] }
- // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
- struct Recovery {
- StateID Src; // The state we are in when encountering the error.
- ExtensionID Strategy; // Heuristic choosing the tokens to match.
- SymbolID Result; // The symbol that is produced.
- };
-
- llvm::ArrayRef<State> states() const { return States; }
- llvm::ArrayRef<Edge> edges() const { return Edges; }
- llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
- llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
- return StartStates;
- }
-
- std::string dumpForTests(const Grammar &) const;
-
-private:
- LRGraph(std::vector<State> States, std::vector<Edge> Edges,
- std::vector<Recovery> Recoveries,
- std::vector<std::pair<SymbolID, StateID>> StartStates)
- : States(std::move(States)), Edges(std::move(Edges)),
- Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
- }
-
- std::vector<State> States;
- std::vector<Edge> Edges;
- std::vector<Recovery> Recoveries;
- std::vector<std::pair<SymbolID, StateID>> StartStates;
-};
-
-} // namespace pseudo
-} // namespace clang
-
-namespace llvm {
-// Support clang::pseudo::Item as DenseMap keys.
-template <> struct DenseMapInfo<clang::pseudo::Item> {
- static inline clang::pseudo::Item getEmptyKey() {
- return clang::pseudo::Item::sentinel(-1);
- }
- static inline clang::pseudo::Item getTombstoneKey() {
- return clang::pseudo::Item::sentinel(-2);
- }
- static unsigned getHashValue(const clang::pseudo::Item &I) {
- return hash_value(I);
- }
- static bool isEqual(const clang::pseudo::Item &LHS,
- const clang::pseudo::Item &RHS) {
- return LHS == RHS;
- }
-};
-} // namespace llvm
-
-#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
deleted file mode 100644
index 1706b6936c9ea2..00000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ /dev/null
@@ -1,278 +0,0 @@
-//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LRTable (referred as LR parsing table in the LR literature) is the core
-// component in LR parsers, it drives the LR parsers by specifying an action to
-// take given the current state on the top of the stack and the current
-// lookahead token.
-//
-// The LRTable can be described as a matrix where the rows represent
-// the states of the LR graph, the columns represent the symbols of the
-// grammar, and each entry of the matrix (called action) represents a
-// state transition in the graph.
-//
-// Typically, based on the category of the grammar symbol, the LRTable is
-// broken into two logically separate tables:
-// - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
-// next action (shift/reduce) on state S under a lookahead terminal a
-// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
-// the state which we transist to from the state S with the nonterminal X
-//
-// LRTable is *performance-critial* as it is consulted frequently during a
-// parse. In general, LRTable is very sparse (most of the entries are empty).
-// For example, for the C++ language, the SLR table has ~1500 states and 650
-// symbols which results in a matrix having 975K entries, ~90% of entries are
-// empty.
-//
-// This file implements a speed-and-space-efficient LRTable.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
-#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Support/Capacity.h"
-#include "llvm/Support/MathExtras.h"
-#include <cstdint>
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-
-// Represents the LR parsing table, which can efficiently the question "what is
-// the next step given the lookahead token and current state on top of the
-// stack?".
-//
-// This is a dense implementation, which only takes an amount of space that is
-// proportional to the number of non-empty entries in the table.
-//
-// Unlike the typical LR parsing table which allows at most one available action
-// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
-// to be used in nondeterministic LR parsers (e.g. GLR).
-//
-// There are no "accept" actions in the LRTable, instead the stack is inspected
-// after parsing completes: is the state goto(StartState, StartSymbol)?
-class LRTable {
-public:
- // StateID is only 13 bits wide.
- using StateID = uint16_t;
- static constexpr unsigned StateBits = 13;
-
- struct Recovery {
- ExtensionID Strategy;
- SymbolID Result;
- };
-
- // Returns the state after we reduce a nonterminal.
- // Expected to be called by LR parsers.
- // If the nonterminal is invalid here, returns std::nullopt.
- std::optional<StateID> getGoToState(StateID State,
- SymbolID Nonterminal) const {
- return Gotos.get(gotoIndex(State, Nonterminal, numStates()));
- }
- // Returns the state after we shift a terminal.
- // Expected to be called by LR parsers.
- // If the terminal is invalid here, returns std::nullopt.
- std::optional<StateID> getShiftState(StateID State,
- SymbolID Terminal) const {
- return Shifts.get(shiftIndex(State, Terminal, numStates()));
- }
-
- // Returns the possible reductions from a state.
- //
- // These are not keyed by a lookahead token. Instead, call canFollow() to
- // check whether a reduction should apply in the current context:
- // for (RuleID R : LR.getReduceRules(S)) {
- // if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
- // continue;
- // // ...apply reduce...
- // }
- llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
- assert(State + 1u < ReduceOffset.size());
- return llvm::ArrayRef(Reduces.data() + ReduceOffset[State],
- Reduces.data() + ReduceOffset[State + 1]);
- }
- // Returns whether Terminal can follow Nonterminal in a valid source file.
- bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
- assert(isToken(Terminal));
- assert(isNonterminal(Nonterminal));
- // tok::unknown is a sentinel value used in recovery: can follow anything.
- return Terminal == tokenSymbol(tok::unknown) ||
- FollowSets.test(tok::NUM_TOKENS * Nonterminal +
- symbolToToken(Terminal));
- }
-
- // Looks up available recovery actions if we stopped parsing in this state.
- llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
- return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State],
- Recoveries.data() + RecoveryOffset[State + 1]);
- }
-
- // Returns the state from which the LR parser should start to parse the input
- // tokens as the given StartSymbol.
- //
- // In LR parsing, the start state of `translation-unit` corresponds to
- // `_ := • translation-unit`.
- //
- // Each start state responds to **a** single grammar rule like `_ := start`.
- // REQUIRE: The given StartSymbol must exist in the grammar (in a form of
- // `_ := start`).
- StateID getStartState(SymbolID StartSymbol) const;
-
- size_t bytes() const {
- return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
- llvm::capacity_in_bytes(Reduces) +
- llvm::capacity_in_bytes(ReduceOffset) +
- llvm::capacity_in_bytes(FollowSets);
- }
-
- std::string dumpStatistics() const;
- std::string dumpForTests(const Grammar &G) const;
-
- // Build a SLR(1) parsing table.
- static LRTable buildSLR(const Grammar &G);
-
- // Helper for building a table with specified actions/states.
- struct Builder {
- Builder() = default;
- Builder(const Grammar &G) {
- NumNonterminals = G.table().Nonterminals.size();
- FollowSets = followSets(G);
- }
-
- unsigned int NumNonterminals = 0;
- // States representing `_ := . start` for various start symbols.
- std::vector<std::pair<SymbolID, StateID>> StartStates;
- // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
- // Key is (initial state, D), value is final state.
- llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
- // Reductions available in a given state.
- llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
- // FollowSets[NT] is the set of terminals that can follow the nonterminal.
- std::vector<llvm::DenseSet<SymbolID>> FollowSets;
- // Recovery options available at each state.
- std::vector<std::pair<StateID, Recovery>> Recoveries;
-
- LRTable build() &&;
- };
-
-private:
- unsigned numStates() const { return ReduceOffset.size() - 1; }
-
- // A map from unsigned key => StateID, used to store actions.
- // The keys should be sequential but the values are somewhat sparse.
- //
- // In practice, the keys encode (origin state, symbol) pairs, and the values
- // are the state we should move to after seeing that symbol.
- //
- // We store one bit for presence/absence of the value for each key.
- // At every 64th key, we store the offset into the table of values.
- // e.g. key 0x500 is checkpoint 0x500/64 = 20
- // Checkpoints[20] = 34
- // get(0x500) = Values[34] (assuming it has a value)
- // To look up values in between, we count the set bits:
- // get(0x509) has a value if HasValue[20] & (1<<9)
- // #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
- // get(0x509) = Values[34 + popcnt(...)]
- //
- // Overall size is 1.25 bits/key + 16 bits/value.
- // Lookup is constant time with a low factor (no hashing).
- class TransitionTable {
- using Word = uint64_t;
- constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);
-
- std::vector<StateID> Values;
- std::vector<Word> HasValue;
- std::vector<uint16_t> Checkpoints;
-
- public:
- TransitionTable() = default;
- TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
- unsigned NumKeys) {
- assert(
- Entries.size() <
- std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
- "16 bits too small for value offsets!");
- unsigned NumWords = (NumKeys + WordBits - 1) / WordBits;
- HasValue.resize(NumWords, 0);
- Checkpoints.reserve(NumWords);
- Values.reserve(Entries.size());
- for (unsigned I = 0; I < NumKeys; ++I) {
- if ((I % WordBits) == 0)
- Checkpoints.push_back(Values.size());
- auto It = Entries.find(I);
- if (It != Entries.end()) {
- HasValue[I / WordBits] |= (Word(1) << (I % WordBits));
- Values.push_back(It->second);
- }
- }
- }
-
- std::optional<StateID> get(unsigned Key) const {
- // Do we have a value for this key?
- Word KeyMask = Word(1) << (Key % WordBits);
- unsigned KeyWord = Key / WordBits;
- if ((HasValue[KeyWord] & KeyMask) == 0)
- return std::nullopt;
- // Count the number of values since the checkpoint.
- Word BelowKeyMask = KeyMask - 1;
- unsigned CountSinceCheckpoint =
- llvm::popcount(HasValue[KeyWord] & BelowKeyMask);
- // Find the value relative to the last checkpoint.
- return Values[Checkpoints[KeyWord] + CountSinceCheckpoint];
- }
-
- unsigned size() const { return Values.size(); }
-
- size_t bytes() const {
- return llvm::capacity_in_bytes(HasValue) +
- llvm::capacity_in_bytes(Values) +
- llvm::capacity_in_bytes(Checkpoints);
- }
- };
- // Shift and Goto tables are keyed by encoded (State, Symbol).
- static unsigned shiftIndex(StateID State, SymbolID Terminal,
- unsigned NumStates) {
- return NumStates * symbolToToken(Terminal) + State;
- }
- static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
- unsigned NumStates) {
- assert(isNonterminal(Nonterminal));
- return NumStates * Nonterminal + State;
- }
- TransitionTable Shifts;
- TransitionTable Gotos;
-
- // A sorted table, storing the start state for each target parsing symbol.
- std::vector<std::pair<SymbolID, StateID>> StartStates;
-
- // Given a state ID S, the half-open range of Reduces is
- // [ReduceOffset[S], ReduceOffset[S+1])
- std::vector<uint32_t> ReduceOffset;
- std::vector<RuleID> Reduces;
- // Conceptually this is a bool[SymbolID][Token], each entry describing whether
- // the grammar allows the (nonterminal) symbol to be followed by the token.
- //
- // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
- // as an index: Nonterminal * NUM_TOKENS + Token.
- llvm::BitVector FollowSets;
-
- // Recovery stores all recovery actions from all states.
- // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
- std::vector<uint32_t> RecoveryOffset;
- std::vector<Recovery> Recoveries;
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
deleted file mode 100644
index a13b5d20cf7c3b..00000000000000
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-add_subdirectory(cli)
-add_subdirectory(cxx)
-add_subdirectory(grammar)
-
-set(LLVM_LINK_COMPONENTS Support)
-
-add_clang_library(clangPseudo
- Bracket.cpp
- DirectiveTree.cpp
- Disambiguate.cpp
- Forest.cpp
- GLR.cpp
- Lex.cpp
- Token.cpp
-
- LINK_LIBS
- clangPseudoGrammar
-
- DEPENDS
- ClangDriverOptions
- )
-
- target_include_directories(clangPseudo INTERFACE
- $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
- )
-
-clang_target_link_libraries(clangPseudo
- PRIVATE
- clangBasic
- clangLex
- )
diff --git a/clang-tools-extra/pseudo/lib/Disambiguate.cpp b/clang-tools-extra/pseudo/lib/Disambiguate.cpp
deleted file mode 100644
index b0bc75cf96c938..00000000000000
--- a/clang-tools-extra/pseudo/lib/Disambiguate.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===--- Disambiguate.cpp - Find the best tree in the forest --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Disambiguate.h"
-
-namespace clang::pseudo {
-
-Disambiguation disambiguate(const ForestNode *Root,
- const DisambiguateParams &Params) {
- // FIXME: this is a dummy placeholder strategy, implement a real one!
- Disambiguation Result;
- for (const ForestNode &N : Root->descendants()) {
- if (N.kind() == ForestNode::Ambiguous)
- Result.try_emplace(&N, 1);
- }
- return Result;
-}
-
-void removeAmbiguities(ForestNode *&Root, const Disambiguation &D) {
- std::vector<ForestNode **> Queue = {&Root};
- while (!Queue.empty()) {
- ForestNode **Next = Queue.back();
- Queue.pop_back();
- switch ((*Next)->kind()) {
- case ForestNode::Sequence:
- for (ForestNode *&Child : (*Next)->elements())
- Queue.push_back(&Child);
- break;
- case ForestNode::Ambiguous: {
- assert(D.count(*Next) != 0 && "disambiguation is incomplete!");
- ForestNode *ChosenChild = (*Next)->alternatives()[D.lookup(*Next)];
- *Next = ChosenChild;
- Queue.push_back(Next);
- break;
- }
- case ForestNode::Terminal:
- case ForestNode::Opaque:
- break;
- }
- }
-}
-
-} // namespace clang::pseudo
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
deleted file mode 100644
index e8e60e5ec475a4..00000000000000
--- a/clang-tools-extra/pseudo/lib/Forest.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Token.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <optional>
-
-namespace clang {
-namespace pseudo {
-
-void ForestNode::RecursiveIterator::operator++() {
- auto C = Cur->children();
- // Try to find a child of the current node to descend into.
- for (unsigned I = 0; I < C.size(); ++I) {
- if (Seen.insert(C[I]).second) {
- Stack.push_back({Cur, I});
- Cur = C[I];
- return;
- }
- }
- // Try to find a sibling af an ancestor to advance to.
- for (; !Stack.empty(); Stack.pop_back()) {
- C = Stack.back().Parent->children();
- unsigned &Index = Stack.back().ChildIndex;
- while (++Index < C.size()) {
- if (Seen.insert(C[Index]).second) {
- Cur = C[Index];
- return;
- }
- }
- }
- Cur = nullptr;
-}
-
-llvm::iterator_range<ForestNode::RecursiveIterator>
-ForestNode::descendants() const {
- return {RecursiveIterator(this), RecursiveIterator()};
-}
-
-std::string ForestNode::dump(const Grammar &G) const {
- switch (kind()) {
- case Ambiguous:
- return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
- case Terminal:
- return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
- startTokenIndex());
- case Sequence:
- return G.dumpRule(rule());
- case Opaque:
- return llvm::formatv("{0} := <opaque>", G.symbolName(symbol()));
- }
- llvm_unreachable("Unhandled node kind!");
-}
-
-std::string ForestNode::dumpRecursive(const Grammar &G,
- bool Abbreviated) const {
- using llvm::formatv;
- Token::Index MaxToken = 0;
- // Count visits of nodes so we can mark those seen multiple times.
- llvm::DenseMap<const ForestNode *, /*VisitCount*/ unsigned> VisitCounts;
- std::function<void(const ForestNode *)> CountVisits =
- [&](const ForestNode *P) {
- MaxToken = std::max(MaxToken, P->startTokenIndex());
- if (VisitCounts[P]++ > 0)
- return; // Don't count children as multiply visited.
- if (P->kind() == Ambiguous)
- llvm::for_each(P->alternatives(), CountVisits);
- else if (P->kind() == Sequence)
- llvm::for_each(P->elements(), CountVisits);
- };
- CountVisits(this);
-
- unsigned IndexWidth = std::max(3, (int)std::to_string(MaxToken).size());
- // e.g. "[{0,4}, {1,4})" if MaxToken is 5742.
- std::string RangeFormat = formatv("[{{0,{0}}, {{1,{0}}) ", IndexWidth);
-
- // The box-drawing characters that should be added as a child is rendered.
- struct LineDecoration {
- std::string Prefix; // Prepended to every line.
- llvm::StringRef First; // added to the child's line.
- llvm::StringRef Subsequent; // added to descendants' lines.
- };
-
- // We print a "#<id>" for nonterminal forest nodes that are being dumped
- // multiple times.
- llvm::DenseMap<const ForestNode *, size_t> ReferenceIds;
- std::string Result;
- constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
- std::function<void(const ForestNode *, Token::Index, std::optional<SymbolID>,
- LineDecoration &LineDec)>
- Dump = [&](const ForestNode *P, Token::Index End,
- std::optional<SymbolID> ElidedParent, LineDecoration LineDec) {
- bool SharedNode = VisitCounts.find(P)->getSecond() > 1;
- llvm::ArrayRef<const ForestNode *> Children;
- auto EndOfElement = [&](size_t ChildIndex) {
- return ChildIndex + 1 == Children.size()
- ? End
- : Children[ChildIndex + 1]->startTokenIndex();
- };
- if (P->kind() == Ambiguous) {
- Children = P->alternatives();
- } else if (P->kind() == Sequence) {
- Children = P->elements();
- if (Abbreviated) {
- // Abbreviate chains of trivial sequence nodes.
- // A := B, B := C, C := D, D := X Y Z
- // becomes
- // A~D := X Y Z
- //
- // We can't hide nodes that appear multiple times in the tree,
- // because we need to call out their identity with IDs.
- if (Children.size() == 1 && !SharedNode) {
- assert(Children[0]->startTokenIndex() == P->startTokenIndex() &&
- EndOfElement(0) == End);
- return Dump(Children[0], End,
- /*ElidedParent=*/ElidedParent.value_or(P->symbol()),
- LineDec);
- }
- }
- }
-
- if (End == KEnd)
- Result += formatv(RangeFormat.c_str(), P->startTokenIndex(), "end");
- else
- Result += formatv(RangeFormat.c_str(), P->startTokenIndex(), End);
- Result += LineDec.Prefix;
- Result += LineDec.First;
- if (ElidedParent) {
- Result += G.symbolName(*ElidedParent);
- Result += "~";
- }
-
- if (SharedNode && P->kind() != ForestNode::Terminal) {
- auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1);
- bool First = It.second;
- unsigned ID = It.first->second;
-
- // The first time, print as #1. Later, =#1.
- if (First) {
- Result += formatv("{0} #{1}", P->dump(G), ID);
- } else {
- Result += formatv("{0} =#{1}", G.symbolName(P->symbol()), ID);
- Children = {}; // Don't walk the children again.
- }
- } else {
- Result.append(P->dump(G));
- }
- Result.push_back('\n');
-
- auto OldPrefixSize = LineDec.Prefix.size();
- LineDec.Prefix += LineDec.Subsequent;
- for (size_t I = 0; I < Children.size(); ++I) {
- if (I == Children.size() - 1) {
- LineDec.First = "└─";
- LineDec.Subsequent = " ";
- } else {
- LineDec.First = "├─";
- LineDec.Subsequent = "│ ";
- }
- Dump(Children[I], P->kind() == Sequence ? EndOfElement(I) : End,
- std::nullopt, LineDec);
- }
- LineDec.Prefix.resize(OldPrefixSize);
- };
- LineDecoration LineDec;
- Dump(this, KEnd, std::nullopt, LineDec);
- return Result;
-}
-
-llvm::ArrayRef<ForestNode>
-ForestArena::createTerminals(const TokenStream &Code) {
- ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size() + 1);
- size_t Index = 0;
- for (const auto &T : Code.tokens()) {
- new (&Terminals[Index])
- ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
- /*Start=*/Index, /*TerminalData*/ 0);
- ++Index;
- }
- // Include an `eof` terminal.
- // This is important to drive the final shift/recover/reduce loop.
- new (&Terminals[Index])
- ForestNode(ForestNode::Terminal, tokenSymbol(tok::eof),
- /*Start=*/Index, /*TerminalData*/ 0);
- ++Index;
- NodeCount = Index;
- return llvm::ArrayRef(Terminals, Index);
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
deleted file mode 100644
index ac43c02db521eb..00000000000000
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ /dev/null
@@ -1,772 +0,0 @@
-//===--- GLR.cpp -----------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <algorithm>
-#include <memory>
-#include <optional>
-#include <queue>
-
-#define DEBUG_TYPE "GLR.cpp"
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin,
- const TokenStream &Tokens,
- const Language &Lang) {
- assert(Strategy != 0);
- if (auto S = Lang.RecoveryStrategies.lookup(Strategy))
- return S(Begin, Tokens);
- return Token::Invalid;
-}
-
-} // namespace
-
-void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
- unsigned &TokenIndex, const ParseParams &Params,
- const Language &Lang,
- std::vector<const GSS::Node *> &NewHeads) {
- LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
- // Describes a possibility to recover by forcibly interpreting a range of
- // tokens around the cursor as a nonterminal that we expected to see.
- struct PlaceholderRecovery {
- // The token prior to the nonterminal which is being recovered.
- // This starts of the region we're skipping, so higher Position is better.
- Token::Index Position;
- // The nonterminal which will be created in order to recover.
- SymbolID Symbol;
- // The heuristic used to choose the bounds of the nonterminal to recover.
- ExtensionID Strategy;
-
- // The GSS head where we are expecting the recovered nonterminal.
- const GSS::Node *RecoveryNode;
- // Payload of nodes on the way back from the OldHead to the recovery node.
- // These represent the partial parse that is being discarded.
- // They should become the children of the opaque recovery node.
- // FIXME: internal structure of opaque nodes is not implemented.
- //
- // There may be multiple paths leading to the same recovery node, we choose
- // one arbitrarily.
- std::vector<const ForestNode *> DiscardedParse;
- };
- std::vector<PlaceholderRecovery> Options;
-
- // Find recovery options by walking up the stack.
- //
- // This is similar to exception handling: we walk up the "frames" of nested
- // rules being parsed until we find one that has a "handler" which allows us
- // to determine the node bounds without parsing it.
- //
- // Unfortunately there's a significant difference: the stack contains both
- // "upward" nodes (ancestor parses) and "leftward" ones.
- // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains:
- // stmt := IF ( expr ) . stmt - current state, we should recover here!
- // stmt := IF ( expr . ) stmt - (left, no recovery here)
- // stmt := IF ( . expr ) stmt - left, we should NOT recover here!
- // stmt := IF . ( expr ) stmt - (left, no recovery here)
- // stmt-seq := . stmt - up, we might recover here
- // compound-stmt := { . stmt-seq } - up, we should recover here!
- //
- // It's not obvious how to avoid collecting "leftward" recovery options.
- // I think the distinction is ill-defined after merging items into states.
- // For now, we have to take this into account when defining recovery rules.
- // (e.g. in the expr recovery above, stay inside the parentheses).
- // FIXME: find a more satisfying way to avoid such false recovery.
- // FIXME: Add a test for spurious recovery once tests can define strategies.
- std::vector<const ForestNode *> Path;
- llvm::DenseSet<const GSS::Node *> Seen;
- auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
- if (!Seen.insert(N).second)
- return;
- if (!N->Recovered) { // Don't recover the same way twice!
- for (auto Strategy : Lang.Table.getRecovery(N->State)) {
- Options.push_back(PlaceholderRecovery{
- NextTok,
- Strategy.Result,
- Strategy.Strategy,
- N,
- Path,
- });
- LLVM_DEBUG(llvm::dbgs()
- << "Option: recover " << Lang.G.symbolName(Strategy.Result)
- << " at token " << NextTok << "\n");
- }
- }
- Path.push_back(N->Payload);
- for (const GSS::Node *Parent : N->parents())
- WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp);
- Path.pop_back();
- };
- for (auto *N : OldHeads)
- WalkUp(N, TokenIndex, WalkUp);
-
- // Now we select the option(s) we will use to recover.
- //
- // We prefer options starting further right, as these discard less code
- // (e.g. we prefer to recover inner scopes rather than outer ones).
- // The options also need to agree on an endpoint, so the parser has a
- // consistent position afterwards.
- //
- // So conceptually we're sorting by the tuple (start, end), though we avoid
- // computing `end` for options that can't be winners.
-
- // Consider options starting further right first.
- // Don't drop the others yet though, we may still use them if preferred fails.
- llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
- return L.Position > R.Position;
- });
-
- // We may find multiple winners, but they will have the same range.
- std::optional<Token::Range> RecoveryRange;
- std::vector<const PlaceholderRecovery *> BestOptions;
- for (const PlaceholderRecovery &Option : Options) {
- // If this starts further left than options we've already found, then
- // we'll never find anything better. Skip computing End for the rest.
- if (RecoveryRange && Option.Position < RecoveryRange->Begin)
- break;
-
- auto End = findRecoveryEndpoint(Option.Strategy, Option.Position,
- Params.Code, Lang);
- // Recovery may not take the parse backwards.
- if (End == Token::Invalid || End < TokenIndex)
- continue;
- if (RecoveryRange) {
- // If this is worse than our previous options, ignore it.
- if (RecoveryRange->End < End)
- continue;
- // If this is an improvement over our previous options, then drop them.
- if (RecoveryRange->End > End)
- BestOptions.clear();
- }
- // Create recovery nodes and heads for them in the GSS. These may be
- // discarded if a better recovery is later found, but this path isn't hot.
- RecoveryRange = {Option.Position, End};
- BestOptions.push_back(&Option);
- }
-
- if (BestOptions.empty()) {
- LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
- << " strategies\n");
- return;
- }
-
- // We've settled on a set of recovery options, so create their nodes and
- // advance the cursor.
- LLVM_DEBUG({
- llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
- for (const auto *Option : BestOptions)
- llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol);
- llvm::dbgs() << "\n";
- });
- // FIXME: in general, we might have the same Option->Symbol multiple times,
- // and we risk creating redundant Forest and GSS nodes.
- // We also may inadvertently set up the next glrReduce to create a sequence
- // node duplicating an opaque node that we're creating here.
- // There are various options, including simply breaking ties between options.
- // For now it's obscure enough to ignore.
- for (const PlaceholderRecovery *Option : BestOptions) {
- Option->RecoveryNode->Recovered = true;
- const ForestNode &Placeholder =
- Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
- LRTable::StateID OldState = Option->RecoveryNode->State;
- LRTable::StateID NewState =
- isToken(Option->Symbol)
- ? *Lang.Table.getShiftState(OldState, Option->Symbol)
- : *Lang.Table.getGoToState(OldState, Option->Symbol);
- const GSS::Node *NewHead =
- Params.GSStack.addNode(NewState, &Placeholder, {Option->RecoveryNode});
- NewHeads.push_back(NewHead);
- }
- TokenIndex = RecoveryRange->End;
-}
-
-using StateID = LRTable::StateID;
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
- std::vector<std::string> ParentStates;
- for (const auto *Parent : N.parents())
- ParentStates.push_back(llvm::formatv("{0}", Parent->State));
- OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
- N.Payload ? N.Payload->symbol() : 0,
- llvm::join(ParentStates, ", "));
- return OS;
-}
-
-// Apply all pending shift actions.
-// In theory, LR parsing doesn't have shift/shift conflicts on a single head.
-// But we may have multiple active heads, and each head has a shift action.
-//
-// We merge the stack -- if multiple heads will reach the same state after
-// shifting a token, we shift only once by combining these heads.
-//
-// E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4:
-// 0---1---2
-// └---3
-// After the shift action, the GSS is:
-// 0---1---2---4
-// └---3---┘
-void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
- const ForestNode &NewTok, const ParseParams &Params,
- const Language &Lang, std::vector<const GSS::Node *> &NewHeads) {
- assert(NewTok.kind() == ForestNode::Terminal);
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n",
- Lang.G.symbolName(NewTok.symbol()),
- OldHeads.size()));
-
- // We group pending shifts by their target state so we can merge them.
- llvm::SmallVector<std::pair<StateID, const GSS::Node *>, 8> Shifts;
- for (const auto *H : OldHeads)
- if (auto S = Lang.Table.getShiftState(H->State, NewTok.symbol()))
- Shifts.push_back({*S, H});
- llvm::stable_sort(Shifts, llvm::less_first{});
-
- auto Rest = llvm::ArrayRef(Shifts);
- llvm::SmallVector<const GSS::Node *> Parents;
- while (!Rest.empty()) {
- // Collect the batch of PendingShift that have compatible shift states.
- // Their heads become TempParents, the parents of the new GSS node.
- StateID NextState = Rest.front().first;
-
- Parents.clear();
- for (const auto &Base : Rest) {
- if (Base.first != NextState)
- break;
- Parents.push_back(Base.second);
- }
- Rest = Rest.drop_front(Parents.size());
-
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n",
- NextState, Parents.size()));
- NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents));
- }
-}
-
-namespace {
-// A KeyedQueue yields pairs of keys and values in order of the keys.
-template <typename Key, typename Value>
-using KeyedQueue =
- std::priority_queue<std::pair<Key, Value>,
- std::vector<std::pair<Key, Value>>, llvm::less_first>;
-
-template <typename T> void sortAndUnique(std::vector<T> &Vec) {
- llvm::sort(Vec);
- Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
-}
-
-// Perform reduces until no more are possible.
-//
-// Generally this means walking up from the heads gathering ForestNodes that
-// will match the RHS of the rule we're reducing into a sequence ForestNode,
-// and ending up at a base node.
-// Then we push a new GSS node onto that base, taking care to:
-// - pack alternative sequence ForestNodes into an ambiguous ForestNode.
-// - use the same GSS node for multiple heads if the parse state matches.
-//
-// Examples of reduction:
-// Before (simple):
-// 0--1(expr)--2(semi)
-// After reducing 2 by `stmt := expr semi`:
-// 0--3(stmt) // 3 is goto(0, stmt)
-//
-// Before (splitting due to R/R conflict):
-// 0--1(IDENTIFIER)
-// After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
-// 0--2(class-name) // 2 is goto(0, class-name)
-// └--3(enum-name) // 3 is goto(0, enum-name)
-//
-// Before (splitting due to multiple bases):
-// 0--2(class-name)--4(STAR)
-// └--3(enum-name)---┘
-// After reducing 4 by `ptr-operator := STAR`:
-// 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
-// └--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
-//
-// Before (joining due to same goto state, multiple bases):
-// 0--1(cv-qualifier)--3(class-name)
-// └--2(cv-qualifier)--4(enum-name)
-// After reducing 3 by `type-name := class-name` and
-// 4 by `type-name := enum-name`:
-// 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
-// └--2(cv-qualifier)--┘ // goto(2, type-name)
-//
-// Before (joining due to same goto state, the same base):
-// 0--1(class-name)--3(STAR)
-// └--2(enum-name)--4(STAR)
-// After reducing 3 by `pointer := class-name STAR` and
-// 2 by`enum-name := class-name STAR`:
-// 0--5(pointer) // 5 is goto(0, pointer)
-//
-// (This is a functor rather than a function to allow it to reuse scratch
-// storage across calls).
-class GLRReduce {
- const ParseParams &Params;
- const Language& Lang;
- // There are two interacting complications:
- // 1. Performing one reduce can unlock new reduces on the newly-created head.
- // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
- // This means we must have unlocked all the reduces that contribute to it.
- // 2b. Similarly, the new GSS nodes must be complete (have all parents).
- //
- // We define a "family" of reduces as those that produce the same symbol and
- // cover the same range of tokens. These are exactly the set of reductions
- // whose sequence nodes would be covered by the same ambiguous node.
- // We wish to process a whole family at a time (to satisfy complication 2),
- // and can address complication 1 by carefully ordering the families:
- // - Process families covering fewer tokens first.
- // A reduce can't depend on a longer reduce!
- // - For equal token ranges: if S := T, process T families before S families.
- // Parsing T can't depend on an equal-length S, as the grammar is acyclic.
- //
- // This isn't quite enough: we don't know the token length of the reduction
- // until we walk up the stack to perform the pop.
- // So we perform the pop part upfront, and place the push specification on
- // priority queues such that we can retrieve a family at a time.
-
- // A reduction family is characterized by its token range and symbol produced.
- // It is used as a key in the priority queues to group pushes by family.
- struct Family {
- // The start of the token range of the reduce.
- Token::Index Start;
- SymbolID Symbol;
- // Rule must produce Symbol and can otherwise be arbitrary.
- // RuleIDs have the topological order based on the acyclic grammar.
- // FIXME: should SymbolIDs be so ordered instead?
- RuleID Rule;
-
- bool operator==(const Family &Other) const {
- return Start == Other.Start && Symbol == Other.Symbol;
- }
- // The larger Family is the one that should be processed first.
- bool operator<(const Family &Other) const {
- if (Start != Other.Start)
- return Start < Other.Start;
- if (Symbol != Other.Symbol)
- return Rule > Other.Rule;
- assert(*this == Other);
- return false;
- }
- };
-
- // A sequence is the ForestNode payloads of the GSS nodes we are reducing.
- using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>;
- // Like ArrayRef<const ForestNode*>, but with the missing operator<.
- // (Sequences are big to move by value as the collections gets rearranged).
- struct SequenceRef {
- SequenceRef(const Sequence &S) : S(S) {}
- llvm::ArrayRef<const ForestNode *> S;
- friend bool operator==(SequenceRef A, SequenceRef B) { return A.S == B.S; }
- friend bool operator<(const SequenceRef &A, const SequenceRef &B) {
- return std::lexicographical_compare(A.S.begin(), A.S.end(), B.S.begin(),
- B.S.end());
- }
- };
- // Underlying storage for sequences pointed to by stored SequenceRefs.
- std::deque<Sequence> SequenceStorage;
- // We don't actually destroy the sequences between calls, to reuse storage.
- // Everything SequenceStorage[ >=SequenceStorageCount ] is reusable scratch.
- unsigned SequenceStorageCount;
-
- // Halfway through a reduction (after the pop, before the push), we have
- // collected nodes for the RHS of a rule, and reached a base node.
- // They specify a sequence ForestNode we may build (but we dedup first).
- // (The RuleID is not stored here, but rather in the Family).
- struct PushSpec {
- // The last node popped before pushing. Its parent is the reduction base(s).
- // (Base is more fundamental, but this is cheaper to store).
- const GSS::Node* LastPop = nullptr;
- Sequence *Seq = nullptr;
- };
- KeyedQueue<Family, PushSpec> Sequences; // FIXME: rename => PendingPushes?
-
- // We treat Heads as a queue of Pop operations still to be performed.
- // PoppedHeads is our position within it.
- std::vector<const GSS::Node *> *Heads;
- unsigned NextPopHead;
- SymbolID Lookahead;
-
- Sequence TempSequence;
-public:
- GLRReduce(const ParseParams &Params, const Language &Lang)
- : Params(Params), Lang(Lang) {}
-
- // Reduce Heads, resulting in new nodes that are appended to Heads.
- // The "consumed" nodes are not removed!
- // Only reduce rules compatible with the Lookahead are applied, though
- // tokenSymbol(tok::unknown) will match any rule.
- void operator()(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead) {
- assert(isToken(Lookahead));
-
- NextPopHead = 0;
- this->Heads = &Heads;
- this->Lookahead = Lookahead;
- assert(Sequences.empty());
- SequenceStorageCount = 0;
-
- popPending();
- while (!Sequences.empty()) {
- pushNext();
- popPending();
- }
- }
-
-private:
- bool canReduce(const Rule &R, RuleID RID,
- llvm::ArrayRef<const ForestNode *> RHS) const {
- if (!R.Guarded)
- return true;
- if (auto Guard = Lang.Guards.lookup(RID))
- return Guard({RHS, Params.Code, Lookahead});
- LLVM_DEBUG(llvm::dbgs()
- << llvm::formatv("missing guard implementation for rule {0}\n",
- Lang.G.dumpRule(RID)));
- return true;
- }
- // pop walks up the parent chain(s) for a reduction from Head by to Rule.
- // Once we reach the end, record the bases and sequences.
- void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) {
- LLVM_DEBUG(llvm::dbgs() << " Pop " << Lang.G.dumpRule(RID) << "\n");
- Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
- TempSequence.resize_for_overwrite(Rule.Size);
- auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
- TempSequence[Rule.Size - 1 - I] = N->Payload;
- if (I + 1 == Rule.Size) {
- F.Start = TempSequence.front()->startTokenIndex();
- LLVM_DEBUG({
- for (const auto *B : N->parents())
- llvm::dbgs() << " --> base at S" << B->State << "\n";
- });
- if (!canReduce(Rule, RID, TempSequence))
- return;
- // Copy the chain to stable storage so it can be enqueued.
- if (SequenceStorageCount == SequenceStorage.size())
- SequenceStorage.emplace_back();
- SequenceStorage[SequenceStorageCount] = TempSequence;
- Sequence *Seq = &SequenceStorage[SequenceStorageCount++];
-
- Sequences.emplace(F, PushSpec{N, Seq});
- return;
- }
- for (const GSS::Node *Parent : N->parents())
- DFS(Parent, I + 1, DFS);
- };
- DFS(Head, 0, DFS);
- }
-
- // popPending pops every available reduction.
- void popPending() {
- for (; NextPopHead < Heads->size(); ++NextPopHead) {
- // In trivial cases, we perform the complete reduce here!
- if (popAndPushTrivial())
- continue;
- for (RuleID RID :
- Lang.Table.getReduceRules((*Heads)[NextPopHead]->State)) {
- const auto &Rule = Lang.G.lookupRule(RID);
- if (Lang.Table.canFollow(Rule.Target, Lookahead))
- pop((*Heads)[NextPopHead], RID, Rule);
- }
- }
- }
-
- // Storage reused by each call to pushNext.
- std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
- std::vector<std::pair<RuleID, SequenceRef>> FamilySequences;
- std::vector<const GSS::Node *> Parents;
- std::vector<const ForestNode *> SequenceNodes;
-
- // Process one push family, forming a forest node.
- // This produces new GSS heads which may enable more pops.
- void pushNext() {
- assert(!Sequences.empty());
- Family F = Sequences.top().first;
-
- LLVM_DEBUG(llvm::dbgs() << " Push " << Lang.G.symbolName(F.Symbol)
- << " from token " << F.Start << "\n");
-
- // Grab the sequences and bases for this family.
- // We don't care which rule yielded each base. If Family.Symbol is S, the
- // base includes an item X := ... • S ... and since the grammar is
- // context-free, *all* parses of S are valid here.
- FamilySequences.clear();
- FamilyBases.clear();
- do {
- const PushSpec &Push = Sequences.top().second;
- FamilySequences.emplace_back(Sequences.top().first.Rule, *Push.Seq);
- for (const GSS::Node *Base : Push.LastPop->parents()) {
- auto NextState = Lang.Table.getGoToState(Base->State, F.Symbol);
- assert(NextState.has_value() && "goto must succeed after reduce!");
- FamilyBases.emplace_back(*NextState, Base);
- }
-
- Sequences.pop();
- } while (!Sequences.empty() && Sequences.top().first == F);
- // Build a forest node for each unique sequence.
- sortAndUnique(FamilySequences);
- SequenceNodes.clear();
- for (const auto &SequenceSpec : FamilySequences)
- SequenceNodes.push_back(&Params.Forest.createSequence(
- F.Symbol, SequenceSpec.first, SequenceSpec.second.S));
- // Wrap in an ambiguous node if needed.
- const ForestNode *Parsed =
- SequenceNodes.size() == 1
- ? SequenceNodes.front()
- : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
- LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n");
-
- // Bases for this family, deduplicate them, and group by the goTo State.
- sortAndUnique(FamilyBases);
- // Create a GSS node for each unique goto state.
- llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases;
- while (!BasesLeft.empty()) {
- StateID NextState = BasesLeft.front().first;
- Parents.clear();
- for (const auto &Base : BasesLeft) {
- if (Base.first != NextState)
- break;
- Parents.push_back(Base.second);
- }
- BasesLeft = BasesLeft.drop_front(Parents.size());
- Heads->push_back(Params.GSStack.addNode(NextState, Parsed, Parents));
- }
- }
-
- // In general we split a reduce into a pop/push, so concurrently-available
- // reductions can run in the correct order. The data structures are expensive.
- //
- // When only one reduction is possible at a time, we can skip this:
- // we pop and immediately push, as an LR parser (as opposed to GLR) would.
- // This is valid whenever there's only one concurrent PushSpec.
- //
- // This function handles a trivial but common subset of these cases:
- // - there must be no pending pushes, and only one poppable head
- // - the head must have only one reduction rule
- // - the reduction path must be a straight line (no multiple parents)
- // (Roughly this means there's no local ambiguity, so the LR algorithm works).
- //
- // Returns true if we successfully consumed the next unpopped head.
- bool popAndPushTrivial() {
- if (!Sequences.empty() || Heads->size() != NextPopHead + 1)
- return false;
- const GSS::Node *Head = Heads->back();
- std::optional<RuleID> RID;
- for (RuleID R : Lang.Table.getReduceRules(Head->State)) {
- if (RID.has_value())
- return false;
- RID = R;
- }
- if (!RID)
- return true; // no reductions available, but we've processed the head!
- const auto &Rule = Lang.G.lookupRule(*RID);
- if (!Lang.Table.canFollow(Rule.Target, Lookahead))
- return true; // reduction is not available
- const GSS::Node *Base = Head;
- TempSequence.resize_for_overwrite(Rule.Size);
- for (unsigned I = 0; I < Rule.Size; ++I) {
- if (Base->parents().size() != 1)
- return false;
- TempSequence[Rule.Size - 1 - I] = Base->Payload;
- Base = Base->parents().front();
- }
- if (!canReduce(Rule, *RID, TempSequence))
- return true; // reduction is not available
- const ForestNode *Parsed =
- &Params.Forest.createSequence(Rule.Target, *RID, TempSequence);
- auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target);
- assert(NextState.has_value() && "goto must succeed after reduce!");
- Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base}));
- LLVM_DEBUG(llvm::dbgs()
- << " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n"
- << " --> S" << Heads->back()->State << "\n");
- return true;
- }
-};
-
-} // namespace
-
-ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
- const Language &Lang) {
- GLRReduce Reduce(Params, Lang);
- assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
- llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
- auto &GSS = Params.GSStack;
-
- StateID StartState = Lang.Table.getStartState(StartSymbol);
- // Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1).
- std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
- /*ForestNode=*/nullptr,
- {})};
- // Invariant: Heads is partitioned by source: {shifted | reduced}.
- // HeadsPartition is the index of the first head formed by reduction.
- // We use this to discard and recreate the reduced heads during recovery.
- unsigned HeadsPartition = Heads.size();
- std::vector<const GSS::Node *> NextHeads;
- auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
- assert(NextHeads.empty() && "Running GC at the wrong time!");
- if (++I != 20) // Run periodically to balance CPU and memory usage.
- return;
- I = 0;
-
- // We need to copy the list: Roots is consumed by the GC.
- Roots = Heads;
- GSS.gc(std::move(Roots));
- };
- // Each iteration fully processes a single token.
- for (unsigned I = 0; I < Terminals.size();) {
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
- "Next token {0} (id={1})\n",
- Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
- // Consume the token.
- glrShift(Heads, Terminals[I], Params, Lang, NextHeads);
-
- // If we weren't able to consume the token, try to skip over some tokens
- // so we can keep parsing.
- if (NextHeads.empty()) {
- // The reduction in the previous round was constrained by lookahead.
- // On valid code this only rejects dead ends, but on broken code we should
- // consider all possibilities.
- //
- // We discard all heads formed by reduction, and recreate them without
- // this constraint. This may duplicate some nodes, but it's rare.
- LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
- "Re-reducing without lookahead.\n");
- Heads.resize(HeadsPartition);
- Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
-
- glrRecover(Heads, I, Params, Lang, NextHeads);
- if (NextHeads.empty())
- // FIXME: Ensure the `_ := start-symbol` rules have a fallback
- // error-recovery strategy attached. Then this condition can't happen.
- return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
- } else
- ++I;
-
- // Form nonterminals containing the token we just consumed.
- SymbolID Lookahead =
- I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
- HeadsPartition = NextHeads.size();
- Reduce(NextHeads, Lookahead);
- // Prepare for the next token.
- std::swap(Heads, NextHeads);
- NextHeads.clear();
- MaybeGC();
- }
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
-
- // The parse was successful if in state `_ := start-symbol EOF .`
- // The GSS parent has `_ := start-symbol . EOF`; its payload is the parse.
- auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol);
- assert(AfterStart.has_value() && "goto must succeed after start symbol!");
- auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof));
- assert(Accept.has_value() && "shift EOF must succeed!");
- auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
- const ForestNode *Result = nullptr;
- for (const auto *Head : Heads) {
- if (Head->State == *Accept) {
- assert(Head->Payload->symbol() == tokenSymbol(tok::eof));
- assert(Result == nullptr && "multiple results!");
- Result = Head->parents().front()->Payload;
- assert(Result->symbol() == StartSymbol);
- }
- }
- return Result;
- };
- if (auto *Result = SearchForAccept(Heads))
- return *const_cast<ForestNode *>(Result); // Safe: we created all nodes.
- // We failed to parse the input, returning an opaque forest node for recovery.
- // FIXME: as above, we can add fallback error handling so this is impossible.
- return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
-}
-
-void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
- const ParseParams &Params, const Language &Lang) {
- // Create a new GLRReduce each time for tests, performance doesn't matter.
- GLRReduce{Params, Lang}(Heads, Lookahead);
-}
-
-const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
- llvm::ArrayRef<const Node *> Parents) {
- Node *Result = new (allocate(Parents.size())) Node();
- Result->State = State;
- Result->GCParity = GCParity;
- Result->ParentCount = Parents.size();
- Alive.push_back(Result);
- ++NodesCreated;
- Result->Payload = Symbol;
- if (!Parents.empty())
- llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
- return Result;
-}
-
-GSS::Node *GSS::allocate(unsigned Parents) {
- if (FreeList.size() <= Parents)
- FreeList.resize(Parents + 1);
- auto &SizedList = FreeList[Parents];
- if (!SizedList.empty()) {
- auto *Result = SizedList.back();
- SizedList.pop_back();
- return Result;
- }
- return static_cast<Node *>(
- Arena.Allocate(sizeof(Node) + Parents * sizeof(Node *), alignof(Node)));
-}
-
-void GSS::destroy(Node *N) {
- unsigned ParentCount = N->ParentCount;
- N->~Node();
- assert(FreeList.size() > ParentCount && "established on construction!");
- FreeList[ParentCount].push_back(N);
-}
-
-unsigned GSS::gc(std::vector<const Node *> &&Queue) {
-#ifndef NDEBUG
- auto ParityMatches = [&](const Node *N) { return N->GCParity == GCParity; };
- assert("Before GC" && llvm::all_of(Alive, ParityMatches));
- auto Deferred = llvm::make_scope_exit(
- [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); });
- assert(llvm::all_of(
- Queue, [&](const Node *R) { return llvm::is_contained(Alive, R); }));
-#endif
- unsigned InitialCount = Alive.size();
-
- // Mark
- GCParity = !GCParity;
- while (!Queue.empty()) {
- Node *N = const_cast<Node *>(Queue.back()); // Safe: we created these nodes.
- Queue.pop_back();
- if (N->GCParity != GCParity) { // Not seen yet
- N->GCParity = GCParity; // Mark as seen
- for (const Node *P : N->parents()) // And walk parents
- Queue.push_back(P);
- }
- }
- // Sweep
- llvm::erase_if(Alive, [&](Node *N) {
- if (N->GCParity == GCParity) // Walk reached this node.
- return false;
- destroy(N);
- return true;
- });
-
- LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size())
- << "/" << InitialCount << " GSS nodes\n");
- return InitialCount - Alive.size();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/cli/CLI.cpp b/clang-tools-extra/pseudo/lib/cli/CLI.cpp
deleted file mode 100644
index 5c7c3b6c827ea8..00000000000000
--- a/clang-tools-extra/pseudo/lib/cli/CLI.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-//===--- CLI.cpp - ----------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/cxx/CXX.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-
-static llvm::cl::opt<std::string> Grammar(
- "grammar",
- llvm::cl::desc(
- "Specify a BNF grammar file path, or a builtin language (cxx)."),
- llvm::cl::init("cxx"));
-
-namespace clang {
-namespace pseudo {
-
-const Language &getLanguageFromFlags() {
- if (::Grammar == "cxx")
- return cxx::getLanguage();
-
- static Language *Lang = []() {
- // Read from a bnf grammar file.
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GrammarText =
- llvm::MemoryBuffer::getFile(::Grammar);
- if (std::error_code EC = GrammarText.getError()) {
- llvm::errs() << "Error: can't read grammar file '" << ::Grammar
- << "': " << EC.message() << "\n";
- std::exit(1);
- }
- std::vector<std::string> Diags;
- auto G = Grammar::parseBNF(GrammarText->get()->getBuffer(), Diags);
- for (const auto &Diag : Diags)
- llvm::errs() << Diag << "\n";
- auto Table = LRTable::buildSLR(G);
- return new Language{
- std::move(G),
- std::move(Table),
- llvm::DenseMap<ExtensionID, RuleGuard>(),
- llvm::DenseMap<ExtensionID, RecoveryStrategy>(),
- };
- }();
- return *Lang;
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
deleted file mode 100644
index 68e644f62fded4..00000000000000
--- a/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- Support
- )
-
-add_clang_library(clangPseudoCLI
- CLI.cpp
-
- # FIXME export the headers from clangPseudoCXX instead
- DEPENDS
- cxx_gen
-
- LINK_LIBS
- clangPseudoGrammar
- clangPseudoCXX
- )
diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
deleted file mode 100644
index 2fecdce6a10f9c..00000000000000
--- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- Support
- )
-
-add_clang_library(clangPseudoCXX
- CXX.cpp
-
- DEPENDS
- cxx_gen
-
- LINK_LIBS
- clangPseudo
- clangPseudoGrammar
- )
-
-clang_target_link_libraries(clangPseudoCXX
- PRIVATE
- clangBasic
- )
diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp
deleted file mode 100644
index 4188dab31d3a91..00000000000000
--- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp
+++ /dev/null
@@ -1,452 +0,0 @@
-//===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/cxx/CXX.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/CharInfo.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/Debug.h"
-#include <utility>
-#define DEBUG_TYPE "CXX.cpp"
-
-namespace clang {
-namespace pseudo {
-namespace cxx {
-namespace {
-static const char *CXXBNF =
-#include "CXXBNF.inc"
- ;
-
-// User-defined string literals look like `""suffix`.
-bool isStringUserDefined(const Token &Tok) {
- return !Tok.text().ends_with("\"");
-}
-bool isCharUserDefined(const Token &Tok) { return !Tok.text().ends_with("'"); }
-
-// Combinable flags describing numbers.
-// Clang has just one numeric_token kind, the grammar has 4.
-enum NumericKind {
- Integer = 0,
- Floating = 1 << 0,
- UserDefined = 1 << 1,
-};
-// Determine the kind of numeric_constant we have.
-// We can assume it's something valid, as it has been lexed.
-// FIXME: is this expensive enough that we should set flags on the token
-// and reuse them rather than computing it for each guard?
-unsigned numKind(const Token &Tok) {
- assert(Tok.Kind == tok::numeric_constant);
- llvm::StringRef Text = Tok.text();
- if (Text.size() <= 1)
- return Integer;
- bool Hex =
- Text.size() > 2 && Text[0] == '0' && (Text[1] == 'x' || Text[1] == 'X');
- uint8_t K = Integer;
-
- for (char C : Text) {
- switch (C) {
- case '.':
- K |= Floating;
- break;
- case 'e':
- case 'E':
- if (!Hex)
- K |= Floating;
- break;
- case 'p':
- case 'P':
- if (Hex)
- K |= Floating;
- break;
- case '_':
- K |= UserDefined;
- break;
- default:
- break;
- }
- }
-
- // We would be done here, but there are stdlib UDLs that lack _.
- // We must distinguish these from the builtin suffixes.
- unsigned LastLetter = Text.size();
- while (LastLetter > 0 && isLetter(Text[LastLetter - 1]))
- --LastLetter;
- if (LastLetter == Text.size()) // Common case
- return NumericKind(K);
- // Trailing d/e/f are not part of the suffix in hex numbers.
- while (Hex && LastLetter < Text.size() && isHexDigit(Text[LastLetter]))
- ++LastLetter;
- return llvm::StringSwitch<int, unsigned>(Text.substr(LastLetter))
- // std::chrono
- .Cases("h", "min", "s", "ms", "us", "ns", "d", "y", K | UserDefined)
- // complex
- .Cases("il", "i", "if", K | UserDefined)
- .Default(K);
-}
-
-// RHS is expected to contain a single terminal.
-// Returns the corresponding token.
-const Token &onlyToken(tok::TokenKind Kind,
- const ArrayRef<const ForestNode *> RHS,
- const TokenStream &Tokens) {
- assert(RHS.size() == 1 && RHS.front()->symbol() == tokenSymbol(Kind));
- return Tokens.tokens()[RHS.front()->startTokenIndex()];
-}
-// RHS is expected to contain a single symbol.
-// Returns the corresponding ForestNode.
-const ForestNode &onlySymbol(SymbolID Kind,
- const ArrayRef<const ForestNode *> RHS,
- const TokenStream &Tokens) {
- assert(RHS.size() == 1 && RHS.front()->symbol() == Kind);
- return *RHS.front();
-}
-
-bool isFunctionDeclarator(const ForestNode *Declarator) {
- assert(Declarator->symbol() == cxx::Symbol::declarator);
- bool IsFunction = false;
- while (true) {
- // not well-formed code, return the best guess.
- if (Declarator->kind() != ForestNode::Sequence)
- return IsFunction;
-
- switch (Declarator->rule()) {
- case rule::noptr_declarator::declarator_id: // reached the bottom
- return IsFunction;
- // *X is a nonfunction (unless X is a function).
- case rule::ptr_declarator::ptr_operator__ptr_declarator:
- Declarator = Declarator->elements()[1];
- IsFunction = false;
- continue;
- // X() is a function (unless X is a pointer or similar).
- case rule::declarator::
- noptr_declarator__parameters_and_qualifiers__trailing_return_type:
- case rule::noptr_declarator::noptr_declarator__parameters_and_qualifiers:
- Declarator = Declarator->elements()[0];
- IsFunction = true;
- continue;
- // X[] is an array (unless X is a pointer or function).
- case rule::noptr_declarator::
- noptr_declarator__L_SQUARE__constant_expression__R_SQUARE:
- case rule::noptr_declarator::noptr_declarator__L_SQUARE__R_SQUARE:
- Declarator = Declarator->elements()[0];
- IsFunction = false;
- continue;
- // (X) is whatever X is.
- case rule::noptr_declarator::L_PAREN__ptr_declarator__R_PAREN:
- Declarator = Declarator->elements()[1];
- continue;
- case rule::ptr_declarator::noptr_declarator:
- case rule::declarator::ptr_declarator:
- Declarator = Declarator->elements()[0];
- continue;
-
- default:
- assert(false && "unhandled declarator for IsFunction");
- return IsFunction;
- }
- }
- llvm_unreachable("unreachable");
-}
-
-bool guardNextTokenNotElse(const GuardParams &P) {
- return symbolToToken(P.Lookahead) != tok::kw_else;
-}
-
-bool specifiesStructuredBinding(const GuardParams &P) {
- const auto DSS = P.RHS[0];
- assert(DSS->symbol() == Symbol::decl_specifier_seq);
-
- auto Length = P.RHS[1]->startTokenIndex() - DSS->startTokenIndex();
- for (const auto &T :
- P.Tokens.tokens().slice(DSS->startTokenIndex(), Length)) {
- switch (T.Kind) {
- case clang::tok::kw_static:
- case clang::tok::kw_thread_local:
- case clang::tok::kw_auto:
- case clang::tok::kw_const:
- case clang::tok::kw_volatile:
- break;
- default:
- return false;
- }
- }
- return true;
-}
-
-// Whether this e.g. decl-specifier contains an "exclusive" type such as a class
-// name, and thus can't combine with a second exclusive type.
-//
-// Returns false for
-// - non-types
-// - "unsigned" etc that may suffice as types but may modify others
-// - cases of uncertainty (e.g. due to ambiguity)
-bool hasExclusiveType(const ForestNode *N) {
- // FIXME: every time we apply this check, we walk the whole subtree.
- // Add per-node caching instead.
- while (true) {
- assert(N->symbol() == Symbol::decl_specifier_seq ||
- N->symbol() == Symbol::type_specifier_seq ||
- N->symbol() == Symbol::defining_type_specifier_seq ||
- N->symbol() == Symbol::decl_specifier ||
- N->symbol() == Symbol::type_specifier ||
- N->symbol() == Symbol::defining_type_specifier ||
- N->symbol() == Symbol::simple_type_specifier);
- if (N->kind() == ForestNode::Opaque)
- return false; // conservative
- if (N->kind() == ForestNode::Ambiguous)
- return llvm::all_of(N->alternatives(), hasExclusiveType); // conservative
- // All supported symbols are nonterminals.
- assert(N->kind() == ForestNode::Sequence);
- switch (N->rule()) {
- // seq := element seq: check element then continue into seq
- case rule::decl_specifier_seq::decl_specifier__decl_specifier_seq:
- case rule::defining_type_specifier_seq::defining_type_specifier__defining_type_specifier_seq:
- case rule::type_specifier_seq::type_specifier__type_specifier_seq:
- if (hasExclusiveType(N->children()[0]))
- return true;
- N = N->children()[1];
- continue;
- // seq := element: continue into element
- case rule::decl_specifier_seq::decl_specifier:
- case rule::type_specifier_seq::type_specifier:
- case rule::defining_type_specifier_seq::defining_type_specifier:
- N = N->children()[0];
- continue;
-
- // defining-type-specifier
- case rule::defining_type_specifier::type_specifier:
- N = N->children()[0];
- continue;
- case rule::defining_type_specifier::class_specifier:
- case rule::defining_type_specifier::enum_specifier:
- return true;
-
- // decl-specifier
- case rule::decl_specifier::defining_type_specifier:
- N = N->children()[0];
- continue;
- case rule::decl_specifier::CONSTEVAL:
- case rule::decl_specifier::CONSTEXPR:
- case rule::decl_specifier::CONSTINIT:
- case rule::decl_specifier::INLINE:
- case rule::decl_specifier::FRIEND:
- case rule::decl_specifier::storage_class_specifier:
- case rule::decl_specifier::TYPEDEF:
- case rule::decl_specifier::function_specifier:
- return false;
-
- // type-specifier
- case rule::type_specifier::elaborated_type_specifier:
- case rule::type_specifier::typename_specifier:
- return true;
- case rule::type_specifier::simple_type_specifier:
- N = N->children()[0];
- continue;
- case rule::type_specifier::cv_qualifier:
- return false;
-
- // simple-type-specifier
- case rule::simple_type_specifier::type_name:
- case rule::simple_type_specifier::template_name:
- case rule::simple_type_specifier::builtin_type:
- case rule::simple_type_specifier::nested_name_specifier__TEMPLATE__simple_template_id:
- case rule::simple_type_specifier::nested_name_specifier__template_name:
- case rule::simple_type_specifier::nested_name_specifier__type_name:
- case rule::simple_type_specifier::decltype_specifier:
- case rule::simple_type_specifier::placeholder_type_specifier:
- return true;
- case rule::simple_type_specifier::LONG:
- case rule::simple_type_specifier::SHORT:
- case rule::simple_type_specifier::SIGNED:
- case rule::simple_type_specifier::UNSIGNED:
- return false;
-
- default:
- LLVM_DEBUG(llvm::errs() << "Unhandled rule " << N->rule() << "\n");
- llvm_unreachable("hasExclusiveType be exhaustive!");
- }
- }
-}
-
-llvm::DenseMap<ExtensionID, RuleGuard> buildGuards() {
-#define GUARD(cond) \
- { \
- [](const GuardParams &P) { return cond; } \
- }
-#define TOKEN_GUARD(kind, cond) \
- [](const GuardParams& P) { \
- const Token &Tok = onlyToken(tok::kind, P.RHS, P.Tokens); \
- return cond; \
- }
-#define SYMBOL_GUARD(kind, cond) \
- [](const GuardParams& P) { \
- const ForestNode &N = onlySymbol(Symbol::kind, P.RHS, P.Tokens); \
- return cond; \
- }
- return {
- {rule::function_declarator::declarator,
- SYMBOL_GUARD(declarator, isFunctionDeclarator(&N))},
- {rule::non_function_declarator::declarator,
- SYMBOL_GUARD(declarator, !isFunctionDeclarator(&N))},
-
- // A {decl,type,defining-type}-specifier-sequence cannot have multiple
- // "exclusive" types (like class names): a value has only one type.
- {rule::defining_type_specifier_seq::
- defining_type_specifier__defining_type_specifier_seq,
- GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))},
- {rule::type_specifier_seq::type_specifier__type_specifier_seq,
- GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))},
- {rule::decl_specifier_seq::decl_specifier__decl_specifier_seq,
- GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))},
-
- {rule::contextual_override::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "override")},
- {rule::contextual_final::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "final")},
- {rule::import_keyword::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "import")},
- {rule::export_keyword::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "export")},
- {rule::module_keyword::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "module")},
- {rule::contextual_zero::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, Tok.text() == "0")},
-
- {rule::selection_statement::IF__L_PAREN__condition__R_PAREN__statement,
- guardNextTokenNotElse},
- {rule::selection_statement::
- IF__L_PAREN__init_statement__condition__R_PAREN__statement,
- guardNextTokenNotElse},
- {rule::selection_statement::
- IF__CONSTEXPR__L_PAREN__condition__R_PAREN__statement,
- guardNextTokenNotElse},
- {rule::selection_statement::
- IF__CONSTEXPR__L_PAREN__init_statement__condition__R_PAREN__statement,
- guardNextTokenNotElse},
-
- // Implement C++ [basic.lookup.qual.general]:
- // If a name, template-id, or decltype-specifier is followed by a
- // ::, it shall designate a namespace, class, enumeration, or
- // dependent type, and the :: is never interpreted as a complete
- // nested-name-specifier.
- {rule::nested_name_specifier::COLONCOLON,
- TOKEN_GUARD(coloncolon, Tok.prev().Kind != tok::identifier)},
-
- // Implement C++ [dcl.pre#6]:
- // A simple-declaration with an identifier-list is called a structured
- // binding declaration ([dcl.struct.bind]). If the decl-specifier-seq
- // contains any decl-specifier other than static, thread_local, auto,
- // or cv-qualifiers, the program is ill-formed.
- {rule::simple_declaration::
- decl_specifier_seq__ref_qualifier__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI,
- specifiesStructuredBinding},
- {rule::simple_declaration::
- decl_specifier_seq__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI,
- specifiesStructuredBinding},
-
- // The grammar distinguishes (only) user-defined vs plain string literals,
- // where the clang lexer distinguishes (only) encoding types.
- {rule::user_defined_string_literal_chunk::STRING_LITERAL,
- TOKEN_GUARD(string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::UTF8_STRING_LITERAL,
- TOKEN_GUARD(utf8_string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::UTF16_STRING_LITERAL,
- TOKEN_GUARD(utf16_string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::UTF32_STRING_LITERAL,
- TOKEN_GUARD(utf32_string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::WIDE_STRING_LITERAL,
- TOKEN_GUARD(wide_string_literal, isStringUserDefined(Tok))},
- {rule::string_literal_chunk::STRING_LITERAL,
- TOKEN_GUARD(string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::UTF8_STRING_LITERAL,
- TOKEN_GUARD(utf8_string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::UTF16_STRING_LITERAL,
- TOKEN_GUARD(utf16_string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::UTF32_STRING_LITERAL,
- TOKEN_GUARD(utf32_string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::WIDE_STRING_LITERAL,
- TOKEN_GUARD(wide_string_literal, !isStringUserDefined(Tok))},
- // And the same for chars.
- {rule::user_defined_character_literal::CHAR_CONSTANT,
- TOKEN_GUARD(char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::UTF8_CHAR_CONSTANT,
- TOKEN_GUARD(utf8_char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::UTF16_CHAR_CONSTANT,
- TOKEN_GUARD(utf16_char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::UTF32_CHAR_CONSTANT,
- TOKEN_GUARD(utf32_char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::WIDE_CHAR_CONSTANT,
- TOKEN_GUARD(wide_char_constant, isCharUserDefined(Tok))},
- {rule::character_literal::CHAR_CONSTANT,
- TOKEN_GUARD(char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::UTF8_CHAR_CONSTANT,
- TOKEN_GUARD(utf8_char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::UTF16_CHAR_CONSTANT,
- TOKEN_GUARD(utf16_char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::UTF32_CHAR_CONSTANT,
- TOKEN_GUARD(utf32_char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::WIDE_CHAR_CONSTANT,
- TOKEN_GUARD(wide_char_constant, !isCharUserDefined(Tok))},
- // clang just has one NUMERIC_CONSTANT token for {ud,plain}x{float,int}
- {rule::user_defined_integer_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == (Integer | UserDefined))},
- {rule::user_defined_floating_point_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == (Floating | UserDefined))},
- {rule::integer_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == Integer)},
- {rule::floating_point_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == Floating)},
- };
-#undef TOKEN_GUARD
-#undef SYMBOL_GUARD
-}
-
-Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) {
- assert(Begin > 0);
- const Token &Left = Tokens.tokens()[Begin - 1];
- assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren ||
- Left.Kind == tok::l_square);
- if (const Token *Right = Left.pair()) {
- assert(Tokens.index(*Right) > Begin - 1);
- return Tokens.index(*Right);
- }
- return Token::Invalid;
-}
-
-llvm::DenseMap<ExtensionID, RecoveryStrategy> buildRecoveryStrategies() {
- return {
- {Extension::Brackets, recoverBrackets},
- };
-}
-
-} // namespace
-
-const Language &getLanguage() {
- static const auto &CXXLanguage = []() -> const Language & {
- std::vector<std::string> Diags;
- auto G = Grammar::parseBNF(CXXBNF, Diags);
- assert(Diags.empty());
- LRTable Table = LRTable::buildSLR(G);
- const Language *PL = new Language{
- std::move(G),
- std::move(Table),
- buildGuards(),
- buildRecoveryStrategies(),
- };
- return *PL;
- }();
- return CXXLanguage;
-}
-
-} // namespace cxx
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf
deleted file mode 100644
index fbd964d4abe861..00000000000000
--- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf
+++ /dev/null
@@ -1,775 +0,0 @@
-# This is a C++ grammar from the C++ standard [1].
-#
-# The grammar is a superset of the true grammar requring semantic constraints to
-# resolve ambiguities. The grammar is context-free and ambiguous (beyond the
-# limit of LR(k)). We use general parsing algorithm (e.g GLR) to handle the
-# grammar and generate a transition table which is used to drive the parsing.
-#
-# It aims to align with the ISO C++ grammar as much as possible. We adjust it
-# to fit the need for the grammar-based parser:
-# - attributes are omitted, which will be handled as comments;
-# - we don't allow nullable nonterminal symbols. There are few nullable
-# nonterminals in the spec grammar, they are adjusted to be non-nullable;
-# - the file merely describes the core C++ grammar. Preprocessor directives and
-# lexical conversions are omitted as we reuse clang's lexer and run a fake
-# preprocessor;
-# - grammar rules with the >> token are adjusted, the greatergreater token is
-# split into two > tokens, to make the GLR parser aware of nested templates
-# and right shift operator;
-#
-# Guidelines:
-# - nonterminals are lower_case; terminals (aka tokens) correspond to
-# clang::TokenKind, written as "IDENTIFIER", "USING", "::" etc;
-# - optional symbols are supported, with a _opt suffix;
-#
-# [1] https://isocpp.org/files/papers/N4860.pdf
-
-# _ lists all the start-symbols which we support parsing.
-#
-# We list important nonterminals as start symbols, rather than doing it for all
-# nonterminals by default, this reduces the number of states by 30% and LRTable
-# actions by 16%.
-_ := translation-unit EOF
-_ := statement-seq EOF
-_ := declaration-seq EOF
-
-# gram.key
-#! we don't distinguish between namespaces and namespace aliases, as it's hard
-#! and uninteresting.
-namespace-name := IDENTIFIER
-template-name := IDENTIFIER
-
-# gram.basic
-#! Custom modifications to eliminate optional declaration-seq
-translation-unit := declaration-seq
-translation-unit := global-module-fragment_opt module-declaration declaration-seq_opt private-module-fragment_opt
-
-# gram.expr
-# expr.prim
-primary-expression := literal
-primary-expression := THIS
-primary-expression := ( expression )
-primary-expression := id-expression
-primary-expression := lambda-expression
-primary-expression := fold-expression
-primary-expression := requires-expression
-id-expression := unqualified-id
-id-expression := qualified-id
-unqualified-id := IDENTIFIER
-unqualified-id := operator-function-id
-unqualified-id := conversion-function-id
-unqualified-id := literal-operator-id
-unqualified-id := ~ type-name
-unqualified-id := ~ decltype-specifier
-unqualified-id := template-id
-qualified-id := nested-name-specifier TEMPLATE_opt unqualified-id
-nested-name-specifier := :: [guard]
-nested-name-specifier := type-name ::
-nested-name-specifier := namespace-name ::
-nested-name-specifier := decltype-specifier ::
-nested-name-specifier := nested-name-specifier IDENTIFIER ::
-nested-name-specifier := nested-name-specifier TEMPLATE_opt simple-template-id ::
-lambda-expression := lambda-introducer lambda-declarator_opt compound-statement
-lambda-expression := lambda-introducer < template-parameter-list > requires-clause_opt lambda-declarator_opt compound-statement
-#! We allow a capture-default to appear anywhere in a capture-list.
-# This simplifies the grammar and error recovery.
-lambda-introducer := [ capture-list_opt ]
-lambda-declarator := ( parameter-declaration-clause_opt ) decl-specifier-seq_opt noexcept-specifier_opt trailing-return-type_opt requires-clause_opt
-capture-list := capture
-capture-list := capture-list , capture
-capture := capture-default
-capture := simple-capture
-capture := init-capture
-capture-default := &
-capture-default := =
-simple-capture := IDENTIFIER ..._opt
-simple-capture := & IDENTIFIER ..._opt
-simple-capture := THIS
-simple-capture := * THIS
-init-capture := ..._opt IDENTIFIER initializer
-init-capture := & ..._opt IDENTIFIER initializer
-fold-expression := ( cast-expression fold-operator ... )
-fold-expression := ( ... fold-operator cast-expression )
-fold-expression := ( cast-expression fold-operator ... fold-operator cast-expression )
-fold-operator := +
-fold-operator := -
-fold-operator := *
-fold-operator := /
-fold-operator := %
-fold-operator := ^
-fold-operator := |
-fold-operator := <<
-fold-operator := greatergreater
-fold-operator := +=
-fold-operator := -=
-fold-operator := *=
-fold-operator := /=
-fold-operator := %=
-fold-operator := ^=
-fold-operator := &=
-fold-operator := |=
-fold-operator := <<=
-fold-operator := >>=
-fold-operator := =
-fold-operator := ==
-fold-operator := !=
-fold-operator := <
-fold-operator := >
-fold-operator := <=
-fold-operator := >=
-fold-operator := &&
-fold-operator := ||
-fold-operator := ,
-fold-operator := .*
-fold-operator := ->*
-requires-expression := REQUIRES requirement-parameter-list_opt requirement-body
-requirement-parameter-list := ( parameter-declaration-clause_opt )
-requirement-body := { requirement-seq }
-requirement-seq := requirement
-requirement-seq := requirement-seq requirement
-requirement := simple-requirement
-requirement := type-requirement
-requirement := compound-requirement
-requirement := nested-requirement
-simple-requirement := expression ;
-type-requirement := TYPENAME nested-name-specifier_opt type-name ;
-compound-requirement := { expression } NOEXCEPT_opt return-type-requirement_opt ;
-return-type-requirement := -> type-constraint
-nested-requirement := REQUIRES constraint-expression ;
-# expr.post
-postfix-expression := primary-expression
-postfix-expression := postfix-expression [ expr-or-braced-init-list ]
-postfix-expression := postfix-expression ( expression-list_opt )
-postfix-expression := simple-type-specifier ( expression-list_opt )
-postfix-expression := typename-specifier ( expression-list_opt )
-postfix-expression := simple-type-specifier braced-init-list
-postfix-expression := postfix-expression . TEMPLATE_opt id-expression
-postfix-expression := postfix-expression -> TEMPLATE_opt id-expression
-postfix-expression := postfix-expression ++
-postfix-expression := postfix-expression --
-postfix-expression := DYNAMIC_CAST < type-id > ( expression )
-postfix-expression := STATIC_CAST < type-id > ( expression )
-postfix-expression := REINTERPRET_CAST < type-id > ( expression )
-postfix-expression := CONST_CAST < type-id > ( expression )
-postfix-expression := TYPEID ( expression )
-postfix-expression := TYPEID ( type-id )
-#! Standard defines expression-list in terms of initializer-list, but our
-# initializer-list allows designators.
-expression-list := initializer-clause ..._opt
-expression-list := expression-list , initializer-clause ..._opt
-# expr.unary
-unary-expression := postfix-expression
-unary-expression := unary-operator cast-expression
-unary-expression := ++ cast-expression
-unary-expression := -- cast-expression
-unary-expression := await-expression
-unary-expression := SIZEOF unary-expression
-unary-expression := SIZEOF ( type-id )
-unary-expression := SIZEOF ... ( IDENTIFIER )
-unary-expression := ALIGNOF ( type-id )
-unary-expression := noexcept-expression
-unary-expression := new-expression
-unary-expression := delete-expression
-unary-operator := *
-unary-operator := &
-unary-operator := +
-unary-operator := -
-unary-operator := !
-unary-operator := ~
-await-expression := CO_AWAIT cast-expression
-noexcept-expression := NOEXCEPT ( expression )
-new-expression := ::_opt NEW new-placement_opt new-type-id new-initializer_opt
-new-expression := ::_opt NEW new-placement_opt ( type-id ) new-initializer_opt
-new-placement := ( expression-list )
-new-type-id := type-specifier-seq new-declarator_opt
-new-declarator := ptr-operator new-declarator_opt
-new-declarator := noptr-new-declarator
-noptr-new-declarator := [ expression_opt ]
-noptr-new-declarator := noptr-new-declarator [ constant-expression ]
-new-initializer := ( expression-list_opt )
-new-initializer := braced-init-list
-delete-expression := ::_opt DELETE cast-expression
-delete-expression := ::_opt DELETE [ ] cast-expression
-cast-expression := unary-expression
-cast-expression := ( type-id ) cast-expression
-# expr.mptr.oper
-pm-expression := cast-expression
-pm-expression := pm-expression .* cast-expression
-pm-expression := pm-expression ->* cast-expression
-# expr.mul
-multiplicative-expression := pm-expression
-multiplicative-expression := multiplicative-expression * pm-expression
-multiplicative-expression := multiplicative-expression / pm-expression
-multiplicative-expression := multiplicative-expression % pm-expression
-# expr.add
-additive-expression := multiplicative-expression
-additive-expression := additive-expression + multiplicative-expression
-additive-expression := additive-expression - multiplicative-expression
-# expr.shift
-shift-expression := additive-expression
-shift-expression := shift-expression << additive-expression
-shift-expression := shift-expression greatergreater additive-expression
-# expr.spaceship
-compare-expression := shift-expression
-compare-expression := compare-expression <=> shift-expression
-# expr.rel
-relational-expression := compare-expression
-relational-expression := relational-expression < compare-expression
-relational-expression := relational-expression > compare-expression
-relational-expression := relational-expression <= compare-expression
-relational-expression := relational-expression >= compare-expression
-# expr.eq
-equality-expression := relational-expression
-equality-expression := equality-expression == relational-expression
-equality-expression := equality-expression != relational-expression
-# expr.bit.and
-and-expression := equality-expression
-and-expression := and-expression & equality-expression
-# expr.xor
-exclusive-or-expression := and-expression
-exclusive-or-expression := exclusive-or-expression ^ and-expression
-# expr.or
-inclusive-or-expression := exclusive-or-expression
-inclusive-or-expression := inclusive-or-expression | exclusive-or-expression
-# expr.log.and
-logical-and-expression := inclusive-or-expression
-logical-and-expression := logical-and-expression && inclusive-or-expression
-# expr.log.or
-logical-or-expression := logical-and-expression
-logical-or-expression := logical-or-expression || logical-and-expression
-# expr.cond
-conditional-expression := logical-or-expression
-conditional-expression := logical-or-expression ? expression : assignment-expression
-# expr.ass
-yield-expression := CO_YIELD assignment-expression
-yield-expression := CO_YIELD braced-init-list
-throw-expression := THROW assignment-expression_opt
-assignment-expression := conditional-expression
-assignment-expression := yield-expression
-assignment-expression := throw-expression
-assignment-expression := logical-or-expression assignment-operator initializer-clause
-assignment-operator := =
-assignment-operator := *=
-assignment-operator := /=
-assignment-operator := %=
-assignment-operator := +=
-assignment-operator := -=
-assignment-operator := >>=
-assignment-operator := <<=
-assignment-operator := &=
-assignment-operator := ^=
-assignment-operator := |=
-# expr.comma
-expression := assignment-expression
-expression := expression , assignment-expression
-# expr.const
-constant-expression := conditional-expression
-
-# gram.stmt
-statement := labeled-statement
-statement := expression-statement
-statement := compound-statement
-statement := selection-statement
-statement := iteration-statement
-statement := jump-statement
-statement := declaration-statement
-statement := try-block
-init-statement := expression-statement
-init-statement := simple-declaration
-condition := expression
-condition := decl-specifier-seq declarator brace-or-equal-initializer
-labeled-statement := IDENTIFIER : statement
-labeled-statement := CASE constant-expression : statement
-labeled-statement := DEFAULT : statement
-expression-statement := expression_opt ;
-compound-statement := { statement-seq_opt [recover=Brackets] }
-statement-seq := statement
-statement-seq := statement-seq statement
-selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement [guard]
-selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement ELSE statement
-selection-statement := SWITCH ( init-statement_opt condition ) statement
-iteration-statement := WHILE ( condition ) statement
-iteration-statement := DO statement WHILE ( expression ) ;
-iteration-statement := FOR ( init-statement condition_opt ; expression_opt ) statement
-iteration-statement := FOR ( init-statement_opt for-range-declaration : for-range-initializer ) statement
-for-range-declaration := decl-specifier-seq declarator
-for-range-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ]
-for-range-initializer := expr-or-braced-init-list
-jump-statement := BREAK ;
-jump-statement := CONTINUE ;
-jump-statement := RETURN expr-or-braced-init-list_opt ;
-jump-statement := coroutine-return-statement
-jump-statement := GOTO IDENTIFIER ;
-coroutine-return-statement := CO_RETURN expr-or-braced-init-list_opt ;
-declaration-statement := block-declaration
-
-# gram.dcl
-declaration-seq := declaration
-declaration-seq := declaration-seq declaration
-declaration := block-declaration
-declaration := nodeclspec-function-declaration
-declaration := function-definition
-declaration := template-declaration
-declaration := deduction-guide
-declaration := explicit-instantiation
-declaration := explicit-specialization
-declaration := export-declaration
-declaration := linkage-specification
-declaration := namespace-definition
-declaration := empty-declaration
-declaration := module-import-declaration
-block-declaration := simple-declaration
-block-declaration := asm-declaration
-block-declaration := namespace-alias-definition
-block-declaration := using-declaration
-block-declaration := using-enum-declaration
-block-declaration := using-directive
-block-declaration := static_assert-declaration
-block-declaration := alias-declaration
-block-declaration := opaque-enum-declaration
-nodeclspec-function-declaration := function-declarator ;
-alias-declaration := USING IDENTIFIER = defining-type-id ;
-simple-declaration := decl-specifier-seq init-declarator-list_opt ;
-simple-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ] initializer ; [guard]
-static_assert-declaration := STATIC_ASSERT ( constant-expression ) ;
-static_assert-declaration := STATIC_ASSERT ( constant-expression , string-literal ) ;
-empty-declaration := ;
-# dcl.spec
-decl-specifier := storage-class-specifier
-decl-specifier := defining-type-specifier
-decl-specifier := function-specifier
-decl-specifier := FRIEND
-decl-specifier := TYPEDEF
-decl-specifier := CONSTEXPR
-decl-specifier := CONSTEVAL
-decl-specifier := CONSTINIT
-decl-specifier := INLINE
-decl-specifier-seq := decl-specifier
-decl-specifier-seq := decl-specifier decl-specifier-seq [guard]
-storage-class-specifier := STATIC
-storage-class-specifier := THREAD_LOCAL
-storage-class-specifier := EXTERN
-storage-class-specifier := MUTABLE
-function-specifier := VIRTUAL
-function-specifier := explicit-specifier
-explicit-specifier := EXPLICIT ( constant-expression )
-explicit-specifier := EXPLICIT
-type-specifier := simple-type-specifier
-type-specifier := elaborated-type-specifier
-type-specifier := typename-specifier
-type-specifier := cv-qualifier
-type-specifier-seq := type-specifier
-type-specifier-seq := type-specifier type-specifier-seq [guard]
-defining-type-specifier := type-specifier
-defining-type-specifier := class-specifier
-defining-type-specifier := enum-specifier
-defining-type-specifier-seq := defining-type-specifier
-defining-type-specifier-seq := defining-type-specifier defining-type-specifier-seq [guard]
-simple-type-specifier := nested-name-specifier_opt type-name
-simple-type-specifier := nested-name-specifier TEMPLATE simple-template-id
-simple-type-specifier := decltype-specifier
-simple-type-specifier := placeholder-type-specifier
-simple-type-specifier := nested-name-specifier_opt template-name
-simple-type-specifier := SHORT
-simple-type-specifier := LONG
-simple-type-specifier := SIGNED
-simple-type-specifier := UNSIGNED
-simple-type-specifier := builtin-type
-#! builtin-type added to aid in classifying which specifiers may combined.
-builtin-type := CHAR
-builtin-type := CHAR8_T
-builtin-type := CHAR16_T
-builtin-type := CHAR32_T
-builtin-type := WCHAR_T
-builtin-type := BOOL
-builtin-type := INT
-builtin-type := FLOAT
-builtin-type := DOUBLE
-builtin-type := VOID
-#! Unlike C++ standard grammar, we don't distinguish the underlying type (class,
-#! enum, typedef) of the IDENTIFIER, as these ambiguities are "local" and don't
-#! affect the final parse tree. Eliminating them gives a significant performance
-#! boost to the parser.
-type-name := IDENTIFIER
-type-name := simple-template-id
-elaborated-type-specifier := class-key nested-name-specifier_opt IDENTIFIER
-elaborated-type-specifier := class-key simple-template-id
-elaborated-type-specifier := class-key nested-name-specifier TEMPLATE_opt simple-template-id
-elaborated-type-specifier := elaborated-enum-specifier
-elaborated-enum-specifier := ENUM nested-name-specifier_opt IDENTIFIER
-decltype-specifier := DECLTYPE ( expression )
-placeholder-type-specifier := type-constraint_opt AUTO
-placeholder-type-specifier := type-constraint_opt DECLTYPE ( AUTO )
-init-declarator-list := init-declarator
-init-declarator-list := init-declarator-list , init-declarator
-#! The standard grammar allows:
-#! 1) an initializer with any declarator, including a function declarator, this
-#! creates an ambiguity where a function definition is misparsed as a simple
-#! declaration;
-#! 2) an function-body with any declarator, includeing a non-function
-#! declarator, this creates an ambiguity whwere a simple-declaration is
-#! misparsed as a function-definition;
-#! We extend the standard declarator to function-declarator and non-function-declarator
-#! to eliminate these false parses.
-init-declarator := non-function-declarator initializer_opt
-init-declarator := function-declarator requires-clause_opt
-function-declarator := declarator [guard]
-non-function-declarator := declarator [guard]
-declarator := ptr-declarator
-declarator := noptr-declarator parameters-and-qualifiers trailing-return-type
-ptr-declarator := noptr-declarator
-ptr-declarator := ptr-operator ptr-declarator
-noptr-declarator := declarator-id
-noptr-declarator := noptr-declarator parameters-and-qualifiers
-noptr-declarator := noptr-declarator [ constant-expression_opt ]
-noptr-declarator := ( ptr-declarator )
-parameters-and-qualifiers := ( parameter-declaration-clause_opt [recover=Brackets] ) cv-qualifier-seq_opt ref-qualifier_opt noexcept-specifier_opt
-trailing-return-type := -> type-id
-ptr-operator := * cv-qualifier-seq_opt
-ptr-operator := &
-ptr-operator := &&
-ptr-operator := nested-name-specifier * cv-qualifier-seq_opt
-cv-qualifier-seq := cv-qualifier cv-qualifier-seq_opt
-cv-qualifier := CONST
-cv-qualifier := VOLATILE
-ref-qualifier := &
-ref-qualifier := &&
-declarator-id := ..._opt id-expression
-type-id := type-specifier-seq abstract-declarator_opt
-defining-type-id := defining-type-specifier-seq abstract-declarator_opt
-abstract-declarator := ptr-abstract-declarator
-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers trailing-return-type
-abstract-declarator := abstract-pack-declarator
-ptr-abstract-declarator := noptr-abstract-declarator
-ptr-abstract-declarator := ptr-operator ptr-abstract-declarator_opt
-noptr-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers
-noptr-abstract-declarator := noptr-abstract-declarator_opt [ constant-expression_opt ]
-noptr-abstract-declarator := ( ptr-abstract-declarator )
-abstract-pack-declarator := noptr-abstract-pack-declarator
-abstract-pack-declarator := ptr-operator abstract-pack-declarator
-noptr-abstract-pack-declarator := noptr-abstract-pack-declarator parameters-and-qualifiers
-noptr-abstract-pack-declarator := noptr-abstract-pack-declarator [ constant-expression_opt ]
-noptr-abstract-pack-declarator := ...
-#! Custom modifications to avoid nullable clause.
-parameter-declaration-clause := parameter-declaration-list
-parameter-declaration-clause := parameter-declaration-list_opt ...
-parameter-declaration-clause := parameter-declaration-list , ...
-parameter-declaration-list := parameter-declaration
-parameter-declaration-list := parameter-declaration-list , parameter-declaration
-parameter-declaration := decl-specifier-seq declarator
-parameter-declaration := decl-specifier-seq declarator = initializer-clause
-parameter-declaration := decl-specifier-seq abstract-declarator_opt
-parameter-declaration := decl-specifier-seq abstract-declarator_opt = initializer-clause
-# dcl.init
-initializer := brace-or-equal-initializer
-initializer := ( expression-list )
-brace-or-equal-initializer := = initializer-clause
-brace-or-equal-initializer := braced-init-list
-initializer-clause := assignment-expression
-initializer-clause := braced-init-list
-#! Allow mixed designated/non-designated init-list.
-# This is standard C, and accepted by clang and others as an extension.
-# FIXME: Decouple recovery from is-there-a-trailing-comma!
-braced-init-list := { initializer-list [recover=Brackets] }
-braced-init-list := { initializer-list , }
-braced-init-list := { }
-initializer-list := initializer-list-item
-initializer-list := initializer-list , initializer-list-item
-initializer-list-item := initializer-clause ..._opt
-initializer-list-item := designator brace-or-equal-initializer ..._opt
-designator := . IDENTIFIER
-#! Array designators are legal in C, and a common extension in C++.
-designator := [ expression ]
-expr-or-braced-init-list := expression
-expr-or-braced-init-list := braced-init-list
-# dcl.fct
-function-definition := decl-specifier-seq_opt function-declarator virt-specifier-seq_opt function-body
-function-definition := decl-specifier-seq_opt function-declarator requires-clause function-body
-function-body := ctor-initializer_opt compound-statement
-function-body := function-try-block
-function-body := = DEFAULT ;
-function-body := = DELETE ;
-# dcl.enum
-enum-specifier := enum-head { enumerator-list_opt }
-enum-specifier := enum-head { enumerator-list , }
-enum-head := enum-key enum-head-name_opt enum-base_opt
-enum-head-name := nested-name-specifier_opt IDENTIFIER
-opaque-enum-declaration := enum-key enum-head-name enum-base_opt ;
-enum-key := ENUM
-enum-key := ENUM CLASS
-enum-key := ENUM STRUCT
-enum-base := : type-specifier-seq
-enumerator-list := enumerator-definition
-enumerator-list := enumerator-list , enumerator-definition
-enumerator-definition := enumerator
-enumerator-definition := enumerator = constant-expression
-enumerator := IDENTIFIER
-using-enum-declaration := USING elaborated-enum-specifier ;
-# basic.namespace
-namespace-definition := named-namespace-definition
-namespace-definition := unnamed-namespace-definition
-namespace-definition := nested-namespace-definition
-named-namespace-definition := INLINE_opt NAMESPACE IDENTIFIER { namespace-body_opt }
-unnamed-namespace-definition := INLINE_opt NAMESPACE { namespace-body_opt }
-nested-namespace-definition := NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER { namespace-body }
-enclosing-namespace-specifier := IDENTIFIER
-enclosing-namespace-specifier := enclosing-namespace-specifier :: INLINE_opt IDENTIFIER
-#! Custom modification to avoid nullable namespace-body.
-namespace-body := declaration-seq
-namespace-alias-definition := NAMESPACE IDENTIFIER = qualified-namespace-specifier ;
-qualified-namespace-specifier := nested-name-specifier_opt namespace-name
-using-directive := USING NAMESPACE nested-name-specifier_opt namespace-name ;
-using-declaration := USING using-declarator-list ;
-using-declarator-list := using-declarator ..._opt
-using-declarator-list := using-declarator-list , using-declarator ..._opt
-using-declarator := TYPENAME_opt nested-name-specifier unqualified-id
-# dcl.asm
-asm-declaration := ASM ( string-literal ) ;
-# dcl.link
-linkage-specification := EXTERN string-literal { declaration-seq_opt }
-linkage-specification := EXTERN string-literal declaration
-
-# gram.module
-module-declaration := export-keyword_opt module-keyword module-name module-partition_opt ;
-module-name := module-name-qualifier_opt IDENTIFIER
-module-partition := : module-name-qualifier_opt IDENTIFIER
-module-name-qualifier := IDENTIFIER .
-module-name-qualifier := module-name-qualifier IDENTIFIER .
-export-declaration := EXPORT declaration
-export-declaration := EXPORT { declaration-seq_opt }
-export-declaration := export-keyword module-import-declaration
-module-import-declaration := import-keyword module-name ;
-module-import-declaration := import-keyword module-partition ;
-# FIXME: we don't have header-name in the grammar. Handle these in PP?
-# module-import-declaration := import-keyword header-name ;
-global-module-fragment := module-keyword ; declaration-seq_opt
-private-module-fragment := module-keyword : PRIVATE ; declaration-seq_opt
-
-# gram.class
-class-specifier := class-head { member-specification_opt [recover=Brackets] }
-class-head := class-key class-head-name class-virt-specifier_opt base-clause_opt
-class-head := class-key base-clause_opt
-class-head-name := nested-name-specifier_opt type-name
-class-virt-specifier := contextual-final
-class-key := CLASS
-class-key := STRUCT
-class-key := UNION
-member-specification := member-declaration member-specification_opt
-member-specification := access-specifier : member-specification_opt
-member-declaration := decl-specifier-seq member-declarator-list_opt ;
-member-declaration := member-declarator-list ;
-member-declaration := function-definition
-member-declaration := using-declaration
-member-declaration := using-enum-declaration
-member-declaration := static_assert-declaration
-member-declaration := template-declaration
-member-declaration := explicit-specialization
-member-declaration := deduction-guide
-member-declaration := alias-declaration
-member-declaration := opaque-enum-declaration
-member-declaration := empty-declaration
-member-declarator-list := member-declarator
-member-declarator-list := member-declarator-list , member-declarator
-member-declarator := function-declarator virt-specifier-seq_opt pure-specifier_opt
-member-declarator := function-declarator requires-clause
-member-declarator := non-function-declarator brace-or-equal-initializer_opt
-member-declarator := IDENTIFIER_opt : constant-expression brace-or-equal-initializer_opt
-virt-specifier-seq := virt-specifier
-virt-specifier-seq := virt-specifier-seq virt-specifier
-virt-specifier := contextual-override
-virt-specifier := contextual-final
-pure-specifier := = contextual-zero
-conversion-function-id := OPERATOR conversion-type-id
-conversion-type-id := type-specifier-seq conversion-declarator_opt
-conversion-declarator := ptr-operator conversion-declarator_opt
-base-clause := : base-specifier-list
-base-specifier-list := base-specifier ..._opt
-base-specifier-list := base-specifier-list , base-specifier ..._opt
-base-specifier := class-or-decltype
-base-specifier := VIRTUAL access-specifier_opt class-or-decltype
-base-specifier := access-specifier VIRTUAL_opt class-or-decltype
-class-or-decltype := nested-name-specifier_opt type-name
-class-or-decltype := nested-name-specifier TEMPLATE simple-template-id
-class-or-decltype := decltype-specifier
-access-specifier := PRIVATE
-access-specifier := PROTECTED
-access-specifier := PUBLIC
-ctor-initializer := : mem-initializer-list
-mem-initializer-list := mem-initializer ..._opt
-mem-initializer-list := mem-initializer-list , mem-initializer ..._opt
-mem-initializer := mem-initializer-id ( expression-list_opt )
-mem-initializer := mem-initializer-id braced-init-list
-mem-initializer-id := class-or-decltype
-mem-initializer-id := IDENTIFIER
-
-# gram.over
-operator-function-id := OPERATOR operator-name
-operator-name := NEW
-operator-name := DELETE
-operator-name := NEW [ ]
-operator-name := DELETE [ ]
-operator-name := CO_AWAIT
-operator-name := ( )
-operator-name := [ ]
-operator-name := ->
-operator-name := ->*
-operator-name := ~
-operator-name := !
-operator-name := +
-operator-name := -
-operator-name := *
-operator-name := /
-operator-name := %
-operator-name := ^
-operator-name := &
-operator-name := |
-operator-name := =
-operator-name := +=
-operator-name := -=
-operator-name := *=
-operator-name := /=
-operator-name := %=
-operator-name := ^=
-operator-name := &=
-operator-name := |=
-operator-name := ==
-operator-name := !=
-operator-name := <
-operator-name := >
-operator-name := <=
-operator-name := >=
-operator-name := <=>
-operator-name := ||
-operator-name := <<
-operator-name := greatergreater
-operator-name := <<=
-operator-name := >>=
-operator-name := ++
-operator-name := --
-operator-name := ,
-literal-operator-id := OPERATOR string-literal IDENTIFIER
-literal-operator-id := OPERATOR user-defined-string-literal
-
-# gram.temp
-template-declaration := template-head declaration
-template-declaration := template-head concept-definition
-template-head := TEMPLATE < template-parameter-list > requires-clause_opt
-template-parameter-list := template-parameter
-template-parameter-list := template-parameter-list , template-parameter
-requires-clause := REQUIRES constraint-logical-or-expression
-constraint-logical-or-expression := constraint-logical-and-expression
-constraint-logical-or-expression := constraint-logical-or-expression || constraint-logical-and-expression
-constraint-logical-and-expression := primary-expression
-constraint-logical-and-expression := constraint-logical-and-expression && primary-expression
-template-parameter := type-parameter
-template-parameter := parameter-declaration
-type-parameter := type-parameter-key ..._opt IDENTIFIER_opt
-type-parameter := type-parameter-key IDENTIFIER_opt = type-id
-type-parameter := type-constraint ..._opt IDENTIFIER_opt
-type-parameter := type-constraint IDENTIFIER_opt = type-id
-type-parameter := template-head type-parameter-key ..._opt IDENTIFIER_opt
-type-parameter := template-head type-parameter-key IDENTIFIER_opt = id-expression
-type-parameter-key := CLASS
-type-parameter-key := TYPENAME
-type-constraint := nested-name-specifier_opt concept-name
-type-constraint := nested-name-specifier_opt concept-name < template-argument-list_opt >
-simple-template-id := template-name < template-argument-list_opt >
-template-id := simple-template-id
-template-id := operator-function-id < template-argument-list_opt >
-template-id := literal-operator-id < template-argument-list_opt >
-template-argument-list := template-argument ..._opt
-template-argument-list := template-argument-list , template-argument ..._opt
-template-argument := constant-expression
-template-argument := type-id
-template-argument := id-expression
-constraint-expression := logical-or-expression
-deduction-guide := explicit-specifier_opt template-name ( parameter-declaration-list_opt ) -> simple-template-id ;
-concept-definition := CONCEPT concept-name = constraint-expression ;
-concept-name := IDENTIFIER
-typename-specifier := TYPENAME nested-name-specifier IDENTIFIER
-typename-specifier := TYPENAME nested-name-specifier TEMPLATE_opt simple-template-id
-explicit-instantiation := EXTERN_opt TEMPLATE declaration
-explicit-specialization := TEMPLATE < > declaration
-
-# gram.except
-try-block := TRY compound-statement handler-seq
-function-try-block := TRY ctor-initializer_opt compound-statement handler-seq
-handler-seq := handler handler-seq_opt
-handler := CATCH ( exception-declaration ) compound-statement
-exception-declaration := type-specifier-seq declarator
-exception-declaration := type-specifier-seq abstract-declarator_opt
-noexcept-specifier := NOEXCEPT ( constant-expression )
-noexcept-specifier := NOEXCEPT
-
-# gram.cpp
-identifier-list := IDENTIFIER
-identifier-list := identifier-list , IDENTIFIER
-
-# gram.lex
-#! As we use clang lexer, most of lexical symbols are not needed, we only add
-#! needed literals.
-literal := integer-literal
-literal := character-literal
-literal := floating-point-literal
-literal := string-literal
-literal := boolean-literal
-literal := pointer-literal
-literal := user-defined-literal
-integer-literal := NUMERIC_CONSTANT [guard]
-character-literal := CHAR_CONSTANT [guard]
-character-literal := WIDE_CHAR_CONSTANT [guard]
-character-literal := UTF8_CHAR_CONSTANT [guard]
-character-literal := UTF16_CHAR_CONSTANT [guard]
-character-literal := UTF32_CHAR_CONSTANT [guard]
-floating-point-literal := NUMERIC_CONSTANT [guard]
-string-literal-chunk := STRING_LITERAL [guard]
-string-literal-chunk := WIDE_STRING_LITERAL [guard]
-string-literal-chunk := UTF8_STRING_LITERAL [guard]
-string-literal-chunk := UTF16_STRING_LITERAL [guard]
-string-literal-chunk := UTF32_STRING_LITERAL [guard]
-#! Technically, string concatenation happens at phase 6 which is before parsing,
-#! so it doesn't belong to the grammar. However, we extend the grammar to
-#! support it, to make the pseudoparser fully functional on practical code.
-string-literal := string-literal-chunk
-string-literal := string-literal string-literal-chunk
-user-defined-literal := user-defined-integer-literal
-user-defined-literal := user-defined-floating-point-literal
-user-defined-literal := user-defined-string-literal
-user-defined-literal := user-defined-character-literal
-user-defined-integer-literal := NUMERIC_CONSTANT [guard]
-user-defined-string-literal-chunk := STRING_LITERAL [guard]
-user-defined-string-literal-chunk := WIDE_STRING_LITERAL [guard]
-user-defined-string-literal-chunk := UTF8_STRING_LITERAL [guard]
-user-defined-string-literal-chunk := UTF16_STRING_LITERAL [guard]
-user-defined-string-literal-chunk := UTF32_STRING_LITERAL [guard]
-user-defined-string-literal := user-defined-string-literal-chunk
-user-defined-string-literal := string-literal-chunk user-defined-string-literal
-user-defined-string-literal := user-defined-string-literal string-literal-chunk
-user-defined-floating-point-literal := NUMERIC_CONSTANT [guard]
-user-defined-character-literal := CHAR_CONSTANT [guard]
-user-defined-character-literal := WIDE_CHAR_CONSTANT [guard]
-user-defined-character-literal := UTF8_CHAR_CONSTANT [guard]
-user-defined-character-literal := UTF16_CHAR_CONSTANT [guard]
-user-defined-character-literal := UTF32_CHAR_CONSTANT [guard]
-boolean-literal := FALSE
-boolean-literal := TRUE
-pointer-literal := NULLPTR
-
-#! Contextual keywords -- clang lexer always lexes them as identifier tokens.
-#! Placeholders for literal text in the grammar that lex as other things.
-contextual-override := IDENTIFIER [guard]
-contextual-final := IDENTIFIER [guard]
-contextual-zero := NUMERIC_CONSTANT [guard]
-module-keyword := IDENTIFIER [guard]
-import-keyword := IDENTIFIER [guard]
-export-keyword := IDENTIFIER [guard]
-
-#! greatergreater token -- clang lexer always lexes it as a single token, we
-#! split it into two tokens to make the GLR parser aware of the nested-template
-#! case.
-greatergreater := > >
-
-#! C++ predefined identifier, __func__ [dcl.fct.def.general] p8
-#! FIXME: add other (MSVC, GNU extension) predefined identifiers.
-primary-expression := predefined-expression
-predefined-expression := __FUNC__
diff --git a/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt b/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
deleted file mode 100644
index bb08ebab0fa62e..00000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-set(LLVM_LINK_COMPONENTS Support)
-
-add_clang_library(clangPseudoGrammar
- Grammar.cpp
- GrammarBNF.cpp
- LRGraph.cpp
- LRTable.cpp
- LRTableBuild.cpp
- )
-
diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
deleted file mode 100644
index 3e9c5c3c7a6c42..00000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-//===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-#include <optional>
-
-namespace clang {
-namespace pseudo {
-
-Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
- : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
- assert(Sequence.size() <= Rule::MaxElements);
- llvm::copy(Sequence, this->Sequence);
-}
-
-Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
- Underscore = *findNonterminal("_");
-}
-
-llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
- assert(isNonterminal(SID));
- const auto &R = T->Nonterminals[SID].RuleRange;
- assert(R.End <= T->Rules.size());
- return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start);
-}
-
-const Rule &Grammar::lookupRule(RuleID RID) const {
- assert(RID < T->Rules.size());
- return T->Rules[RID];
-}
-
-llvm::StringRef Grammar::symbolName(SymbolID SID) const {
- if (isToken(SID))
- return T->Terminals[symbolToToken(SID)];
- return T->Nonterminals[SID].Name;
-}
-
-std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
- auto It = llvm::partition_point(
- T->Nonterminals,
- [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
- if (It != T->Nonterminals.end() && It->Name == Name)
- return It - T->Nonterminals.begin();
- return std::nullopt;
-}
-
-std::string Grammar::dumpRule(RuleID RID) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- const Rule &R = T->Rules[RID];
- OS << symbolName(R.Target) << " :=";
- for (unsigned I = 0; I < R.Size; ++I) {
- OS << " " << symbolName(R.Sequence[I]);
- if (R.RecoveryIndex == I)
- OS << " [recover=" << T->AttributeValues[R.Recovery] << "]";
- }
- if (R.Guarded)
- OS << " [guard]";
- return Result;
-}
-
-std::string Grammar::dumpRules(SymbolID SID) const {
- assert(isNonterminal(SID));
- std::string Result;
- const auto &Range = T->Nonterminals[SID].RuleRange;
- for (RuleID RID = Range.Start; RID < Range.End; ++RID)
- Result.append(dumpRule(RID)).push_back('\n');
- return Result;
-}
-
-std::string Grammar::dump() const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "Nonterminals:\n";
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
- OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID));
- OS << "Rules:\n";
- for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
- OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID));
- return OS.str();
-}
-
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
- std::vector<llvm::DenseSet<SymbolID>> FirstSets(
- G.table().Nonterminals.size());
- auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
- assert(isNonterminal(Target));
- if (isToken(First))
- return FirstSets[Target].insert(First).second;
- bool Changed = false;
- for (SymbolID SID : FirstSets[First])
- Changed |= FirstSets[Target].insert(SID).second;
- return Changed;
- };
-
- // A rule S := T ... implies elements in FIRST(S):
- // - if T is a terminal, FIRST(S) contains T
- // - if T is a nonterminal, FIRST(S) contains FIRST(T)
- // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
- // end up being incomplete.
- // We iterate until we hit a fixed point.
- // (This isn't particularly efficient, but table building isn't on the
- // critical path).
- bool Changed = true;
- while (Changed) {
- Changed = false;
- for (const auto &R : G.table().Rules)
- // We only need to consider the first element because symbols are
- // non-nullable.
- Changed |= ExpandFirstSet(R.Target, R.seq().front());
- }
- return FirstSets;
-}
-
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
- auto FirstSets = firstSets(G);
- std::vector<llvm::DenseSet<SymbolID>> FollowSets(
- G.table().Nonterminals.size());
- // Expand the follow set of a nonterminal symbol Y by adding all from the
- // given symbol set.
- auto ExpandFollowSet = [&FollowSets](SymbolID Y,
- const llvm::DenseSet<SymbolID> &ToAdd) {
- assert(isNonterminal(Y));
- bool Changed = false;
- for (SymbolID F : ToAdd)
- Changed |= FollowSets[Y].insert(F).second;
- return Changed;
- };
- // Follow sets is computed based on the following 3 rules, the computation
- // is completed at a fixed point where there is no more new symbols can be
- // added to any of the follow sets.
- //
- // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
- // augmented grammar, in our case it is '_'.
- FollowSets[G.underscore()].insert(tokenSymbol(tok::eof));
- bool Changed = true;
- while (Changed) {
- Changed = false;
- for (const auto &R : G.table().Rules) {
- // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
- // FOLLOW(Y).
- for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
- if (isToken(R.seq()[I]))
- continue;
- // We only need to consider the next symbol because symbols are
- // non-nullable.
- SymbolID Next = R.seq()[I + 1];
- if (isToken(Next))
- // First set for a terminal is itself.
- Changed |= ExpandFollowSet(R.seq()[I], {Next});
- else
- Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
- }
- // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
- // FOLLOW(Z).
- SymbolID Z = R.seq().back();
- if (isNonterminal(Z))
- Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
- }
- }
- return FollowSets;
-}
-
-static llvm::ArrayRef<std::string> getTerminalNames() {
- static const auto &TerminalNames = []() {
- auto TerminalNames = new std::string[NumTerminals];
-#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling;
-#define KEYWORD(Keyword, Condition) \
- TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper();
-#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper();
-#include "clang/Basic/TokenKinds.def"
- return llvm::ArrayRef(TerminalNames, NumTerminals);
- }();
- return TerminalNames;
-}
-GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
deleted file mode 100644
index f1b8e06e22432c..00000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ /dev/null
@@ -1,362 +0,0 @@
-//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <memory>
-#include <utility>
-
-namespace clang {
-namespace pseudo {
-
-namespace {
-static const llvm::StringRef OptSuffix = "_opt";
-static const llvm::StringRef StartSymbol = "_";
-
-// Builds grammar from BNF files.
-class GrammarBuilder {
-public:
- GrammarBuilder(std::vector<std::string> &Diagnostics)
- : Diagnostics(Diagnostics) {}
-
- Grammar build(llvm::StringRef BNF) {
- auto Specs = eliminateOptional(parse(BNF));
-
- assert(llvm::all_of(Specs,
- [](const RuleSpec &R) {
- if (R.Target.ends_with(OptSuffix))
- return false;
- return llvm::all_of(
- R.Sequence, [](const RuleSpec::Element &E) {
- return !E.Symbol.ends_with(OptSuffix);
- });
- }) &&
- "Optional symbols should be eliminated!");
-
- auto T = std::make_unique<GrammarTable>();
-
- // Assemble the name->ID and ID->nonterminal name maps.
- llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
- llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
-
- llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
-
- for (uint16_t I = 0; I < NumTerminals; ++I)
- SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
- auto Consider = [&](llvm::StringRef Name) {
- if (!SymbolIds.count(Name))
- UniqueNonterminals.insert(Name);
- };
- for (const auto &Spec : Specs) {
- Consider(Spec.Target);
- for (const RuleSpec::Element &Elt : Spec.Sequence) {
- Consider(Elt.Symbol);
- for (const auto& KV : Elt.Attributes)
- UniqueAttributeValues.insert(KV.second);
- }
- }
- for (llvm::StringRef Name : UniqueNonterminals) {
- T->Nonterminals.emplace_back();
- T->Nonterminals.back().Name = Name.str();
- }
- assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
- "Too many nonterminals to fit in SymbolID bits!");
- llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
- const GrammarTable::Nonterminal &R) {
- return L.Name < R.Name;
- });
- // Add an empty string for the corresponding sentinel unset attribute.
- T->AttributeValues.push_back("");
- UniqueAttributeValues.erase("");
- for (llvm::StringRef Name : UniqueAttributeValues) {
- T->AttributeValues.emplace_back();
- T->AttributeValues.back() = Name.str();
- }
- llvm::sort(T->AttributeValues);
- assert(T->AttributeValues.front() == "");
-
- // Build name -> ID maps for nonterminals.
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
- SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
-
- // Convert the rules.
- T->Rules.reserve(Specs.size());
- std::vector<SymbolID> Symbols;
- auto Lookup = [SymbolIds](llvm::StringRef Name) {
- auto It = SymbolIds.find(Name);
- assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
- return It->second;
- };
- for (const auto &Spec : Specs) {
- assert(Spec.Sequence.size() <= Rule::MaxElements);
- Symbols.clear();
- for (const RuleSpec::Element &Elt : Spec.Sequence)
- Symbols.push_back(Lookup(Elt.Symbol));
- T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
- applyAttributes(Spec, *T, T->Rules.back());
- }
-
- assert(T->Rules.size() < (1 << RuleBits) &&
- "Too many rules to fit in RuleID bits!");
- const auto &SymbolOrder = getTopologicalOrder(T.get());
- llvm::stable_sort(
- T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
- // Sorted by the topological order of the nonterminal Target.
- return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
- });
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
- auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
- return SymbolOrder[R.Target] < SymbolOrder[SID];
- });
- RuleID Start = StartIt - T->Rules.begin();
- RuleID End = Start;
- while (End < T->Rules.size() && T->Rules[End].Target == SID)
- ++End;
- T->Nonterminals[SID].RuleRange = {Start, End};
- }
- Grammar G(std::move(T));
- diagnoseGrammar(G);
- return G;
- }
-
- // Gets topological order for nonterminal symbols.
- //
- // The topological order is defined as: if a *single* nonterminal A produces
- // (or transitively) a nonterminal B (that said, there is a production rule
- // B := A), then A is less than B.
- //
- // It returns the sort key for each symbol, the array is indexed by SymbolID.
- std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
- std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
- for (const auto &Rule : T->Rules) {
- // if A := B, A depends on B.
- if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
- Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
- }
- llvm::sort(Dependencies);
- std::vector<SymbolID> Order;
- // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
- enum State {
- NotVisited,
- Visiting,
- Visited,
- };
- std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
- std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
- if (VisitStates[SID] == Visited)
- return;
- if (VisitStates[SID] == Visiting) {
- Diagnostics.push_back(
- llvm::formatv("The grammar contains a cycle involving symbol {0}",
- T->Nonterminals[SID].Name));
- return;
- }
- VisitStates[SID] = Visiting;
- for (auto It = llvm::lower_bound(Dependencies,
- std::pair<SymbolID, SymbolID>{SID, 0});
- It != Dependencies.end() && It->first == SID; ++It)
- DFS(It->second);
- VisitStates[SID] = Visited;
- Order.push_back(SID);
- };
- for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
- DFS(ID);
- std::vector<unsigned> Result(T->Nonterminals.size(), 0);
- for (size_t I = 0; I < Order.size(); ++I)
- Result[Order[I]] = I;
- return Result;
- }
-
-private:
- // Text representation of a BNF grammar rule.
- struct RuleSpec {
- llvm::StringRef Target;
- struct Element {
- llvm::StringRef Symbol; // Name of the symbol
- // Attributes that are associated to the sequence symbol or rule.
- std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
- Attributes;
- };
- std::vector<Element> Sequence;
-
- std::string toString() const {
- std::vector<llvm::StringRef> Body;
- for (const auto &E : Sequence)
- Body.push_back(E.Symbol);
- return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
- }
- };
-
- std::vector<RuleSpec> parse(llvm::StringRef Lines) {
- std::vector<RuleSpec> Specs;
- for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
- Line = Line.trim();
- // Strip anything coming after the '#' (comment).
- Line = Line.take_while([](char C) { return C != '#'; });
- if (Line.empty())
- continue;
- RuleSpec Rule;
- if (parseLine(Line, Rule))
- Specs.push_back(std::move(Rule));
- }
- return Specs;
- }
-
- bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
- auto Parts = Line.split(":=");
- if (Parts.first == Line) { // no separator in Line
- Diagnostics.push_back(
- llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
- return false;
- }
-
- Out.Target = Parts.first.trim();
- Out.Sequence.clear();
- for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
- Chunk = Chunk.trim();
- if (Chunk.empty())
- continue; // skip empty
- if (Chunk.starts_with("[") && Chunk.ends_with("]")) {
- if (Out.Sequence.empty())
- continue;
-
- parseAttributes(Chunk, Out.Sequence.back().Attributes);
- continue;
- }
-
- Out.Sequence.push_back({Chunk, /*Attributes=*/{}});
- }
- return true;
- }
-
- bool parseAttributes(
- llvm::StringRef Content,
- std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
- assert(Content.starts_with("[") && Content.ends_with("]"));
- auto KV = Content.drop_front().drop_back().split('=');
- Out.push_back({KV.first, KV.second.trim()});
-
- return true;
- }
- // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
- void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
- auto LookupExtensionID = [&T](llvm::StringRef Name) {
- const auto It = llvm::partition_point(
- T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
- assert(It != T.AttributeValues.end() && *It == Name &&
- "Didn't find the attribute in AttrValues!");
- return It - T.AttributeValues.begin();
- };
- for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
- for (const auto &KV : Spec.Sequence[I].Attributes) {
- if (KV.first == "guard") {
- R.Guarded = true;
- } else if (KV.first == "recover") {
- R.Recovery = LookupExtensionID(KV.second);
- R.RecoveryIndex = I;
- } else {
- Diagnostics.push_back(
- llvm::formatv("Unknown attribute '{0}'", KV.first).str());
- }
- }
- }
- }
-
- // Inlines all _opt symbols.
- // For example, a rule E := id +_opt id, after elimination, we have two
- // equivalent rules:
- // 1) E := id + id
- // 2) E := id id
- std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
- std::vector<RuleSpec> Results;
- std::vector<RuleSpec::Element> Storage;
- for (const auto &R : Input) {
- eliminateOptionalTail(
- R.Sequence, Storage, [&Results, &Storage, &R, this]() {
- if (Storage.empty()) {
- Diagnostics.push_back(
- llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
- return;
- }
- Results.push_back({R.Target, Storage});
- });
- assert(Storage.empty());
- }
- return Results;
- }
- void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
- std::vector<RuleSpec::Element> &Result,
- llvm::function_ref<void()> CB) {
- if (Elements.empty())
- return CB();
- auto Front = Elements.front();
- if (!Front.Symbol.ends_with(OptSuffix)) {
- Result.push_back(std::move(Front));
- eliminateOptionalTail(Elements.drop_front(1), Result, CB);
- Result.pop_back();
- return;
- }
- // Enumerate two options: skip the opt symbol, or inline the symbol.
- eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
- Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt"
- Result.push_back(std::move(Front));
- eliminateOptionalTail(Elements.drop_front(1), Result, CB);
- Result.pop_back();
- }
-
- // Diagnoses the grammar and emit warnings if any.
- void diagnoseGrammar(const Grammar &G) {
- const auto &T = G.table();
- for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
- auto Range = T.Nonterminals[SID].RuleRange;
- if (Range.Start == Range.End)
- Diagnostics.push_back(
- llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
- llvm::StringRef NameRef = T.Nonterminals[SID].Name;
- if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
- Diagnostics.push_back(llvm::formatv(
- "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
- }
- }
- llvm::DenseSet<llvm::hash_code> VisitedRules;
- for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
- const auto &R = T.Rules[RID];
- auto Code = llvm::hash_combine(
- R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end()));
- auto [_, New] = VisitedRules.insert(Code);
- if (!New)
- Diagnostics.push_back(
- llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
- }
- // symbol-id -> used counts
- std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
- for (const Rule &R : T.Rules)
- for (SymbolID SID : R.seq())
- if (isNonterminal(SID))
- ++UseCounts[SID];
- for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
- if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
- Diagnostics.push_back(
- llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
- }
- std::vector<std::string> &Diagnostics;
-};
-} // namespace
-
-Grammar Grammar::parseBNF(llvm::StringRef BNF,
- std::vector<std::string> &Diagnostics) {
- Diagnostics.clear();
- return GrammarBuilder(Diagnostics).build(BNF);
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
deleted file mode 100644
index 82c7cc7d8b2936..00000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/LRGraph.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-using ItemSet = std::vector<clang::pseudo::Item>;
-
-namespace llvm {
-// Support clang::pseudo::Item as DenseMap keys.
-template <> struct DenseMapInfo<ItemSet> {
- static inline ItemSet getEmptyKey() {
- return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()};
- }
- static inline ItemSet getTombstoneKey() {
- return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()};
- }
- static unsigned getHashValue(const ItemSet &I) {
- return llvm::hash_combine_range(I.begin(), I.end());
- }
- static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
- return LHS == RHS;
- }
-};
-} // namespace llvm
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-struct SortByNextSymbol {
- SortByNextSymbol(const Grammar &G) : G(G) {}
- bool operator()(const Item &L, const Item &R) {
- if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
- return L.next(G) < R.next(G);
- if (L.hasNext() != R.hasNext())
- return L.hasNext() < R.hasNext(); // a trailing dot is minimal.
- return L < R;
- }
- const Grammar &G;
-};
-
-// Computes a closure of the given item set S:
-// - extends the given S to contain all options for parsing next token;
-// - nonterminals after a dot are recursively expanded into the begin-state
-// of all production rules that produce that nonterminal;
-//
-// Given
-// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
-// Input = [ E := . T ]
-// returns [ E := . T, T := . n, T := . ( E ) ]
-State closure(ItemSet Queue, const Grammar &G) {
- llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
- // We reuse the passed-by-value Queue as the final result, as it's already
- // initialized to the right elements.
- size_t ItIndex = 0;
- while (ItIndex < Queue.size()) {
- const Item &ExpandingItem = Queue[ItIndex];
- ++ItIndex;
- if (!ExpandingItem.hasNext())
- continue;
-
- SymbolID NextSym = ExpandingItem.next(G);
- if (pseudo::isToken(NextSym))
- continue;
- auto RRange = G.table().Nonterminals[NextSym].RuleRange;
- for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
- Item NewItem = Item::start(RID, G);
- if (InQueue.insert(NewItem).second) // new
- Queue.push_back(std::move(NewItem));
- }
- }
- Queue.shrink_to_fit();
- llvm::sort(Queue, SortByNextSymbol(G));
- return {std::move(Queue)};
-}
-
-// Returns all next (with a dot advanced) kernel item sets, partitioned by the
-// advanced symbol.
-//
-// Given
-// S = [ E := . a b, E := E . - T ]
-// returns [
-// {id(a), [ E := a . b ]},
-// {id(-), [ E := E - . T ]}
-// ]
-std::vector<std::pair<SymbolID, ItemSet>>
-nextAvailableKernelItems(const State &S, const Grammar &G) {
- std::vector<std::pair<SymbolID, ItemSet>> Results;
- llvm::ArrayRef<Item> AllItems = S.Items;
- AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
- while (!AllItems.empty()) {
- SymbolID AdvancedSymbol = AllItems.front().next(G);
- auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
- assert(I.hasNext());
- return I.next(G) == AdvancedSymbol;
- });
- assert(!Batch.empty());
- AllItems = AllItems.drop_front(Batch.size());
-
- // Advance a dot over the Symbol.
- ItemSet Next;
- for (const Item &I : Batch)
- Next.push_back(I.advance());
- // sort the set to keep order determinism for hash computation.
- llvm::sort(Next);
- Results.push_back({AdvancedSymbol, std::move(Next)});
- }
- return Results;
-}
-
-std::vector<std::pair<ExtensionID, SymbolID>>
-availableRecovery(const State &S, const Grammar &G) {
- std::vector<std::pair<ExtensionID, SymbolID>> Result;
- for (const Item &I : S.Items) {
- const auto &Rule = G.lookupRule(I.rule());
- if (I.dot() != Rule.RecoveryIndex)
- continue;
- Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
- }
- llvm::sort(Result);
- Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
- return Result;
-}
-
-} // namespace
-
-std::string Item::dump(const Grammar &G) const {
- const auto &Rule = G.lookupRule(RID);
- auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
- std::vector<llvm::StringRef> Results;
- for (auto SID : Syms)
- Results.push_back(G.symbolName(SID));
- return Results;
- };
- return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
- llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
- llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
- Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
- .str();
-}
-
-std::string State::dump(const Grammar &G, unsigned Indent) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- for (const auto &Item : Items)
- OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
- return OS.str();
-}
-
-std::string LRGraph::dumpForTests(const Grammar &G) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "States:\n";
- for (StateID ID = 0; ID < States.size(); ++ID) {
- OS << llvm::formatv("State {0}\n", ID);
- OS << States[ID].dump(G, /*Indent*/ 4);
- }
- for (const auto &E : Edges) {
- OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
- E.Dst);
- }
- return OS.str();
-}
-
-LRGraph LRGraph::buildLR0(const Grammar &G) {
- class Builder {
- public:
- Builder(const Grammar &G) : G(G) {}
-
- // Adds a given state if not existed.
- std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
- assert(llvm::is_sorted(KernelItems) &&
- "Item must be sorted before inserting to a hash map!");
- auto It = StatesIndex.find(KernelItems);
- if (It != StatesIndex.end())
- return {It->second, false};
- States.push_back(closure(KernelItems, G));
- StateID NextStateID = States.size() - 1;
- StatesIndex.insert({std::move(KernelItems), NextStateID});
- return {NextStateID, true};
- }
-
- void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
- Edges.push_back({Src, Dst, Label});
- }
-
- void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) {
- Recoveries.push_back({Src, Strategy, Result});
- }
-
- // Returns a state with the given id.
- const State &find(StateID ID) const {
- assert(ID < States.size());
- return States[ID];
- }
-
- void addStartState(SymbolID Sym, StateID State) {
- StartStates.push_back({Sym, State});
- }
-
- LRGraph build() && {
- States.shrink_to_fit();
- Edges.shrink_to_fit();
- Recoveries.shrink_to_fit();
- llvm::sort(StartStates);
- StartStates.shrink_to_fit();
- return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
- std::move(StartStates));
- }
-
- private:
- // Key is the **kernel** item sets.
- llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
- std::vector<State> States;
- std::vector<Edge> Edges;
- std::vector<Recovery> Recoveries;
- const Grammar &G;
- std::vector<std::pair<SymbolID, StateID>> StartStates;
- } Builder(G);
-
- std::vector<StateID> PendingStates;
- // Initialize states with the start symbol.
- auto RRange = G.table().Nonterminals[G.underscore()].RuleRange;
- for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
- auto StartState = std::vector<Item>{Item::start(RID, G)};
- auto Result = Builder.insert(std::move(StartState));
- assert(Result.second && "State must be new");
- PendingStates.push_back(Result.first);
-
- const Rule &StartRule = G.lookupRule(RID);
- assert(StartRule.Size == 2 &&
- StartRule.seq().back() == tokenSymbol(tok::eof) &&
- "Start rule must be of the form `_ := start-symbol EOF`!");
- Builder.addStartState(StartRule.seq().front(), Result.first);
- }
-
- while (!PendingStates.empty()) {
- auto StateID = PendingStates.back();
- PendingStates.pop_back();
- for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
- auto Insert = Builder.insert(Next.second);
- if (Insert.second) // new state, insert to the pending queue.
- PendingStates.push_back(Insert.first);
- Builder.insertEdge(StateID, Insert.first, Next.first);
- }
- for (auto Recovery : availableRecovery(Builder.find(StateID), G))
- Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
- }
- return std::move(Builder).build();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
deleted file mode 100644
index 6a68f1489d57a5..00000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace clang {
-namespace pseudo {
-
-std::string LRTable::dumpStatistics() const {
- return llvm::formatv(R"(
-Statistics of the LR parsing table:
- number of states: {0}
- number of actions: shift={1} goto={2} reduce={3}
- size of the table (bytes): {4}
-)",
- numStates(), Shifts.size(), Gotos.size(), Reduces.size(),
- bytes())
- .str();
-}
-
-std::string LRTable::dumpForTests(const Grammar &G) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "LRTable:\n";
- for (StateID S = 0; S < numStates(); ++S) {
- OS << llvm::formatv("State {0}\n", S);
- for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) {
- SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
- if (auto SS = getShiftState(S, TokID))
- OS.indent(4) << llvm::formatv("{0}: shift state {1}\n",
- G.symbolName(TokID), SS);
- }
- for (RuleID R : getReduceRules(S)) {
- SymbolID Target = G.lookupRule(R).Target;
- std::vector<llvm::StringRef> Terminals;
- for (unsigned Terminal = 0; Terminal < NumTerminals; ++Terminal) {
- SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
- if (canFollow(Target, TokID))
- Terminals.push_back(G.symbolName(TokID));
- }
- OS.indent(4) << llvm::formatv("{0}: reduce by rule {1} '{2}'\n",
- llvm::join(Terminals, " "), R,
- G.dumpRule(R));
- }
- for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
- ++NontermID) {
- if (auto GS = getGoToState(S, NontermID)) {
- OS.indent(4) << llvm::formatv("{0}: go to state {1}\n",
- G.symbolName(NontermID), *GS);
- }
- }
- }
- return OS.str();
-}
-
-LRTable::StateID LRTable::getStartState(SymbolID Target) const {
- assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!");
- auto It = llvm::partition_point(
- StartStates, [Target](const std::pair<SymbolID, StateID> &X) {
- return X.first < Target;
- });
- assert(It != StartStates.end() && It->first == Target &&
- "target symbol doesn't have a start state!");
- return It->second;
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
deleted file mode 100644
index 387e1c54ee99b3..00000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRGraph.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include <cstdint>
-
-namespace clang {
-namespace pseudo {
-
-LRTable LRTable::Builder::build() && {
- assert(NumNonterminals != 0 && "Set NumNonterminals or init with grammar");
- LRTable Table;
-
- // Count number of states: every state has to be reachable somehow.
- StateID MaxState = 0;
- for (const auto &Entry : StartStates)
- MaxState = std::max(MaxState, Entry.second);
- for (const auto &Entry : Transition)
- MaxState = std::max(MaxState, Entry.second);
- unsigned NumStates = MaxState + 1;
-
- Table.StartStates = std::move(StartStates);
-
- // Compile the goto and shift actions into transition tables.
- llvm::DenseMap<unsigned, SymbolID> Gotos;
- llvm::DenseMap<unsigned, SymbolID> Shifts;
- for (const auto &E : Transition) {
- if (isToken(E.first.second))
- Shifts.try_emplace(shiftIndex(E.first.first, E.first.second, NumStates),
- E.second);
- else
- Gotos.try_emplace(gotoIndex(E.first.first, E.first.second, NumStates),
- E.second);
- }
- Table.Shifts = TransitionTable(Shifts, NumStates * NumTerminals);
- Table.Gotos = TransitionTable(Gotos, NumStates * NumNonterminals);
-
- // Compile the follow sets into a bitmap.
- Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size());
- for (SymbolID NT = 0; NT < FollowSets.size(); ++NT)
- for (SymbolID Follow : FollowSets[NT])
- Table.FollowSets.set(NT * tok::NUM_TOKENS + symbolToToken(Follow));
-
- // Store the reduce actions in a vector partitioned by state.
- Table.ReduceOffset.reserve(NumStates + 1);
- std::vector<RuleID> StateRules;
- for (StateID S = 0; S < NumStates; ++S) {
- Table.ReduceOffset.push_back(Table.Reduces.size());
- auto It = Reduce.find(S);
- if (It == Reduce.end())
- continue;
- Table.Reduces.insert(Table.Reduces.end(), It->second.begin(),
- It->second.end());
- llvm::sort(Table.Reduces.begin() + Table.ReduceOffset.back(),
- Table.Reduces.end());
- }
- Table.ReduceOffset.push_back(Table.Reduces.size());
-
- // Error recovery entries: sort (no dups already), and build offset lookup.
- llvm::sort(Recoveries, [&](const auto &L, const auto &R) {
- return std::tie(L.first, L.second.Result, L.second.Strategy) <
- std::tie(R.first, R.second.Result, R.second.Strategy);
- });
- Table.Recoveries.reserve(Recoveries.size());
- for (const auto &R : Recoveries)
- Table.Recoveries.push_back({R.second.Strategy, R.second.Result});
- Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
- unsigned SortedIndex = 0;
- for (StateID State = 0; State < NumStates; ++State) {
- Table.RecoveryOffset[State] = SortedIndex;
- while (SortedIndex < Recoveries.size() &&
- Recoveries[SortedIndex].first == State)
- SortedIndex++;
- }
- Table.RecoveryOffset[NumStates] = SortedIndex;
- assert(SortedIndex == Recoveries.size());
-
- return Table;
-}
-
-LRTable LRTable::buildSLR(const Grammar &G) {
- auto Graph = LRGraph::buildLR0(G);
- Builder Build(G);
- Build.StartStates = Graph.startStates();
- for (const auto &T : Graph.edges())
- Build.Transition.try_emplace({T.Src, T.Label}, T.Dst);
- for (const auto &Entry : Graph.recoveries())
- Build.Recoveries.push_back(
- {Entry.Src, Recovery{Entry.Strategy, Entry.Result}});
- Build.FollowSets = followSets(G);
- assert(Graph.states().size() <= (1 << StateBits) &&
- "Graph states execceds the maximum limit!");
- // Add reduce actions.
- for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
- for (const Item &I : Graph.states()[SID].Items) {
- // If we've just parsed the start symbol, this means we successfully parse
- // the input. We don't add the reduce action of `_ := start_symbol` in the
- // LRTable (the GLR parser handles it specifically).
- if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext())
- continue;
- if (!I.hasNext())
- // If we've reached the end of a rule A := ..., then we can reduce if
- // the next token is in the follow set of A.
- Build.Reduce[SID].insert(I.rule());
- }
- }
- return std::move(Build).build();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/test/.clang-format b/clang-tools-extra/pseudo/test/.clang-format
deleted file mode 100644
index e3845288a2aece..00000000000000
--- a/clang-tools-extra/pseudo/test/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-DisableFormat: true
diff --git a/clang-tools-extra/pseudo/test/CMakeLists.txt b/clang-tools-extra/pseudo/test/CMakeLists.txt
deleted file mode 100644
index 712527f78140e0..00000000000000
--- a/clang-tools-extra/pseudo/test/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-set(CLANG_PSEUDO_TEST_DEPS
- clang-pseudo
- clang-pseudo-fuzzer
- ClangPseudoTests
- )
-
-foreach(dep FileCheck not count)
- if(TARGET ${dep})
- list(APPEND CLANG_PSEUDO_TEST_DEPS ${dep})
- endif()
-endforeach()
-
-configure_lit_site_cfg(
- ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
- ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
- MAIN_CONFIG
- ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
- )
-
-configure_lit_site_cfg(
- ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.site.cfg.py.in
- ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg.py
- MAIN_CONFIG
- ${CMAKE_CURRENT_SOURCE_DIR}/Unit/lit.cfg.py
- )
-
-add_lit_testsuite(check-clang-pseudo "Running the clang-pseudo regression tests"
- ${CMAKE_CURRENT_BINARY_DIR}
- DEPENDS ${CLANG_PSEUDO_TEST_DEPS})
diff --git a/clang-tools-extra/pseudo/test/Unit/lit.cfg.py b/clang-tools-extra/pseudo/test/Unit/lit.cfg.py
deleted file mode 100644
index 000a8a772c31b1..00000000000000
--- a/clang-tools-extra/pseudo/test/Unit/lit.cfg.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import lit.formats
-
-config.name = "clangPseudo Unit Tests"
-config.test_format = lit.formats.GoogleTest(".", "Tests")
-config.test_source_root = config.clang_pseudo_binary_dir + "/unittests"
-config.test_exec_root = config.clang_pseudo_binary_dir + "/unittests"
-
-# Point the dynamic loader at dynamic libraries in 'lib'.
-# FIXME: it seems every project has a copy of this logic. Move it somewhere.
-import platform
-
-if platform.system() == "Darwin":
- shlibpath_var = "DYLD_LIBRARY_PATH"
-elif platform.system() == "Windows":
- shlibpath_var = "PATH"
-else:
- shlibpath_var = "LD_LIBRARY_PATH"
-config.environment[shlibpath_var] = os.path.pathsep.join(
- ("@SHLIBDIR@", "@LLVM_LIBS_DIR@", config.environment.get(shlibpath_var, ""))
-)
-
-# It is not realistically possible to account for all options that could
-# possibly be present in system and user configuration files, so disable
-# default configs for the test runs.
-config.environment["CLANG_NO_DEFAULT_CONFIG"] = "1"
diff --git a/clang-tools-extra/pseudo/test/Unit/lit.site.cfg.py.in b/clang-tools-extra/pseudo/test/Unit/lit.site.cfg.py.in
deleted file mode 100644
index 4107b0d2eb83fa..00000000000000
--- a/clang-tools-extra/pseudo/test/Unit/lit.site.cfg.py.in
+++ /dev/null
@@ -1,11 +0,0 @@
-@LIT_SITE_CFG_IN_HEADER@
-# This is a shim to run the gtest unittests in ../unittests using lit.
-
-config.llvm_libs_dir = "@LLVM_LIBS_DIR@"
-config.shlibdir = "@SHLIBDIR@"
-
-config.clang_pseudo_source_dir = "@CMAKE_CURRENT_SOURCE_DIR@/.."
-config.clang_pseudo_binary_dir = "@CMAKE_CURRENT_BINARY_DIR@/.."
-
-# Delegate logic to lit.cfg.py.
-lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/Unit/lit.cfg.py")
diff --git a/clang-tools-extra/pseudo/test/check-cxx-bnf.test b/clang-tools-extra/pseudo/test/check-cxx-bnf.test
deleted file mode 100644
index b825ff32faa1c6..00000000000000
--- a/clang-tools-extra/pseudo/test/check-cxx-bnf.test
+++ /dev/null
@@ -1,2 +0,0 @@
-// verify clang/lib/Tooling/Syntax/Pseudo/cxx/cxx.bnf
-// RUN: clang-pseudo -grammar=%cxx-bnf-file
diff --git a/clang-tools-extra/pseudo/test/crash/backslashes.c b/clang-tools-extra/pseudo/test/crash/backslashes.c
deleted file mode 100644
index 4ca70c609a0e64..00000000000000
--- a/clang-tools-extra/pseudo/test/crash/backslashes.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// We used to try to interpret these backslashes as UCNs.
-// RUN: clang-pseudo -source=%s -print-tokens
-\
-\ x
diff --git a/clang-tools-extra/pseudo/test/cxx/capture-list.cpp b/clang-tools-extra/pseudo/test/cxx/capture-list.cpp
deleted file mode 100644
index fde46e4f0e0383..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/capture-list.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-// We loosely allow capture defaults in any position/multiple times.
-auto lambda = [&, &foo, bar(x), =]{};
-// CHECK: lambda-introducer := [ capture-list ]
-// CHECK-NEXT: ├─[
-// CHECK-NEXT: ├─capture-list
-// CHECK-NEXT: │ ├─capture-list
-// CHECK-NEXT: │ │ ├─capture-list
-// CHECK-NEXT: │ │ │ ├─capture-list~& := tok[4]
-// CHECK-NEXT: │ │ │ ├─,
-// CHECK-NEXT: │ │ │ └─capture~simple-capture
-// CHECK-NEXT: │ │ │ ├─&
-// CHECK-NEXT: │ │ │ └─IDENTIFIER := tok[7]
-// CHECK-NEXT: │ │ ├─,
-// CHECK-NEXT: │ │ └─capture~init-capture
-// CHECK-NEXT: │ │ ├─IDENTIFIER := tok[9]
-// CHECK-NEXT: │ │ └─initializer := ( expression-list )
-// CHECK-NEXT: │ │ ├─(
-// CHECK-NEXT: │ │ ├─expression-list~IDENTIFIER := tok[11]
-// CHECK-NEXT: │ │ └─)
-// CHECK-NEXT: │ ├─,
-// CHECK-NEXT: │ └─capture~=
-// CHECK-NEXT: └─]
diff --git a/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp b/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
deleted file mode 100644
index ae74353c0a1563..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-// Verify that the contextual-{final,override} rules are guarded conditionally,
-// No ambiguous parsing for the virt-specifier.
-class Foo {
- void foo1() override;
-// CHECK: virt-specifier-seq~IDENTIFIER := tok[7]
- void foo2() final;
-// CHECK: virt-specifier-seq~IDENTIFIER := tok[13]
-};
diff --git a/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp b/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp
deleted file mode 100644
index 151f3931b53f9e..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s
-
-// Verify the else should belong to the nested if statement
-if (true) if (true) {} else {}
-
-// CHECK: statement-seq~selection-statement := IF ( condition ) statement
-// CHECK-NEXT: ├─IF
-// CHECK-NEXT: ├─(
-// CHECK-NEXT: ├─condition~TRUE
-// CHECK-NEXT: ├─)
-// CHECK-NEXT: └─statement~selection-statement
-// CHECK-NEXT: ├─IF
-// CHECK-NEXT: ├─(
-// CHECK-NEXT: ├─condition~TRUE
-// CHECK-NEXT: ├─)
-// CHECK-NEXT: ├─statement~compound-statement := { }
-// CHECK-NEXT: │ ├─{
-// CHECK-NEXT: │ └─}
-// CHECK-NEXT: ├─ELSE
-// CHECK-NEXT: └─statement~compound-statement := { }
-// CHECK-NEXT: ├─{
-// CHECK-NEXT: └─}
diff --git a/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp b/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp
deleted file mode 100644
index 255e8bedac4975..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-
-// not parsed as Type{foo} Type{bar}
-foo bar;
-// CHECK-NOT: simple-declaration := decl-specifier-seq ;
-// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK: ├─decl-specifier-seq~simple-type-specifier
-// CHECK: ├─init-declarator-list~IDENTIFIER
-// CHECK: └─;
-// CHECK-NOT: simple-declaration := decl-specifier-seq ;
-
-// not parsed as Type{std} Type{::string} Declarator{s};
-std::string s;
-// CHECK-NOT: nested-name-specifier := ::
-// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK: ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
-// CHECK: │ ├─simple-type-specifier := nested-name-specifier type-name
-// CHECK: │ │ ├─nested-name-specifier := <ambiguous> #1
-// CHECK: │ │ │ ├─nested-name-specifier := type-name ::
-// CHECK: │ │ │ └─nested-name-specifier := namespace-name ::
-// CHECK: │ │ └─type-name
-// CHECK: │ └─simple-type-specifier := nested-name-specifier template-name
-// CHECK: │ ├─nested-name-specifier =#1
-// CHECK: │ └─template-name~IDENTIFIER
-// CHECK: ├─init-declarator-list~IDENTIFIER
-// CHECK: └─;
-// CHECK-NOT: nested-name-specifier := ::
diff --git a/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp b/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp
deleted file mode 100644
index 4d7972807c6db7..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// The standard grammar allows an init-list with any declarator, including
-// a function declarator. This creates an ambiguity where a function-definition
-// is misparsed as a simple-declaration.
-
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void s(){};
-// CHECK-NOT: simple-declaration
-// CHECK: function-definition := decl-specifier-seq function-declarator function-body
-// CHECK-NOT: simple-declaration
diff --git a/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp b/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp
deleted file mode 100644
index 5aedd8037513ff..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// The standard grammar allows an function-body to use any declarator, including
-// a non-function declarator. This creates an ambiguity where a
-// simple-declaration is misparsed as a function-definition.
-
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void (*s)(){};
-// CHECK-NOT: function-definition
-// CHECK: init-declarator := non-function-declarator initializer
-// CHECK-NOT: function-definition
diff --git a/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp b/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp
deleted file mode 100644
index 58d0ff4ccae9a2..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-
-// Similiar to declarator-function.cpp, but for member functions.
-class Foo {
- void foo() {};
-// CHECK-NOT: member-declarator := declarator brace-or-equal-initializer
-// CHECK: member-declaration~function-definition := decl-specifier-seq function-declarator function-body
-// CHECK-NOT: member-declarator := declarator brace-or-equal-initializer
-};
diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp
deleted file mode 100644
index 2540dd010fceff..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest --forest-abbrev=false | FileCheck %s
-class A {
- ;
-// CHECK-NOT: member-declaration := ;
-// CHECK: member-declaration := empty-declaration
-// CHECK-NOT: member-declaration := ;
-};
diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
deleted file mode 100644
index 4d15835565b7ec..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-class Foo {
-public:
-};
-// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=Brackets] }
-// CHECK-NEXT: ├─class-head := class-key class-head-name
-// CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
-// CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]
-// CHECK-NEXT: ├─{ := tok[2]
-// CHECK-NEXT: ├─member-specification := access-specifier :
-// CHECK-NEXT: │ ├─access-specifier~PUBLIC := tok[3]
-// CHECK-NEXT: │ └─: := tok[4]
-// CHECK-NEXT: └─} := tok[5]
diff --git a/clang-tools-extra/pseudo/test/cxx/keyword.cpp b/clang-tools-extra/pseudo/test/cxx/keyword.cpp
deleted file mode 100644
index 318db4ccc49b9f..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/keyword.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-bool operator<();
-// CHECK: translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~BOOL
-// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator parameters-and-qualifiers
-// CHECK-NEXT: │ ├─noptr-declarator~operator-function-id := OPERATOR operator-name
-// CHECK-NEXT: │ │ ├─OPERATOR
-// CHECK-NEXT: │ │ └─operator-name~<
-// CHECK-NEXT: │ └─parameters-and-qualifiers := ( )
-// CHECK-NEXT: │ ├─(
-// CHECK-NEXT: │ └─)
-// CHECK-NEXT: └─;
diff --git a/clang-tools-extra/pseudo/test/cxx/literals.cpp b/clang-tools-extra/pseudo/test/cxx/literals.cpp
deleted file mode 100644
index e1cec8985b25f2..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/literals.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest -forest-abbrev=0 | FileCheck %s --implicit-check-not=ambiguous
-auto list = {
- 0, // CHECK: := integer-literal
- 0b1011, // CHECK: := integer-literal
- 0777, // CHECK: := integer-literal
- 42_u, // CHECK: := user-defined-integer-literal
- 0LL, // CHECK: := integer-literal
- 0h, // CHECK: := user-defined-integer-literal
- 0., // CHECK: := floating-point-literal
- .2, // CHECK: := floating-point-literal
- 2e1, // CHECK: := floating-point-literal
- 0x42d, // CHECK: := integer-literal
- 0x42_d, // CHECK: := user-defined-integer-literal
- 0x42ds, // CHECK: := user-defined-integer-literal
- 0x1.2p2,// CHECK: := floating-point-literal
-
- "", // CHECK: literal := string-literal
- L"", // CHECK: literal := string-literal
- u8"", // CHECK: literal := string-literal
- u"", // CHECK: literal := string-literal
- U"", // CHECK: literal := string-literal
- R"()", // CHECK: literal := string-literal
- uR"()", // CHECK: literal := string-literal
- "a" "b", // CHECK: literal := string-literal
- u8"a" "b", // CHECK: literal := string-literal
- u"a" u"b", // CHECK: literal := string-literal
- "a"_u "b", // CHECK: user-defined-literal := user-defined-string-literal
- "a"_u u"b", // CHECK: user-defined-literal := user-defined-string-literal
- R"(a)" "\n", // CHECK: literal := string-literal
- R"c(a)c"_u u"\n", // CHECK: user-defined-literal := user-defined-string-literal
-
- 'a', // CHECK: := character-literal
- 'abc', // CHECK: := character-literal
- 'abcdef', // CHECK: := character-literal
- u'a', // CHECK: := character-literal
- U'a', // CHECK: := character-literal
- L'a', // CHECK: := character-literal
- L'abc', // CHECK: := character-literal
- U'\u1234',// CHECK: := character-literal
- '\u1234', // CHECK: := character-literal
- u'a'_u, // CHECK: := user-defined-character-literal
-};
-
diff --git a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp b/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
deleted file mode 100644
index d605a3d66a5de8..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-// FIXME: tighten CHECK to CHECK-NEXT once numeric literals are unambiguous.
-auto x = { 1, .f = 2, [c]{3} };
-// CHECK: initializer-clause~braced-init-list
-// CHECK-NEXT: ├─{ := tok[3]
-// CHECK-NEXT: ├─initializer-list
-// CHECK-NEXT: │ ├─initializer-list
-// CHECK-NEXT: │ │ ├─initializer-list~NUMERIC_CONSTANT
-// CHECK-NEXT: │ │ ├─, := tok[5]
-// CHECK-NEXT: │ │ └─initializer-list-item
-// CHECK-NEXT: │ │ ├─designator
-// CHECK-NEXT: │ │ │ ├─. := tok[6]
-// CHECK-NEXT: │ │ │ └─IDENTIFIER := tok[7]
-// CHECK-NEXT: │ │ └─brace-or-equal-initializer
-// CHECK-NEXT: │ │ ├─= := tok[8]
-// CHECK-NEXT: │ │ └─initializer-clause~NUMERIC_CONSTANT
-// CHECK-NEXT: │ ├─, := tok[10]
-// CHECK-NEXT: │ └─initializer-list-item
-// CHECK-NEXT: │ ├─designator
-// CHECK-NEXT: │ │ ├─[ := tok[11]
-// CHECK-NEXT: │ │ ├─expression~IDENTIFIER := tok[12]
-// CHECK-NEXT: │ │ └─] := tok[13]
-// CHECK-NEXT: │ └─brace-or-equal-initializer~braced-init-list
-// CHECK-NEXT: │ ├─{ := tok[14]
-// CHECK-NEXT: │ ├─initializer-list~NUMERIC_CONSTANT
-// CHECK: │ └─} := tok[16]
-// CHECK-NEXT: └─} := tok[17]
diff --git a/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp b/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp
deleted file mode 100644
index 41d0fa13ff6dd4..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-
-// Verify that we don't form a complete `::` nested-name-specifier if there is
-// an identifier preceding it.
-Foo::Foo() {} // No "Foo ::Foo()" false parse
-// CHECK: ├─declaration-seq~function-definition := function-declarator function-body
-// CHECK-NEXT: │ ├─function-declarator~noptr-declarator := noptr-declarator parameters-and-qualifiers
-
-int ::x;
-// CHECK: declaration~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~INT
-
-void test() {
- X::Y::Z; // No false qualified-declarator parses "X ::Y::Z" and "X::Y ::Z".
-// CHECK: statement-seq~statement := <ambiguous>
-// CHECK: statement~expression-statement := expression ;
-// CHECK: statement~simple-declaration := decl-specifier-seq ;
-// CHECK-NOT: simple-declaration := decl-specifier-seq init-declarator-list ;
-
- // FIXME: eliminate the false `a<b> ::c` declaration parse.
- a<b>::c;
-// CHECK: statement := <ambiguous>
-// CHECK-NEXT: ├─statement~expression-statement := expression ;
-// CHECK-NEXT: │ ├─expression~relational-expression :=
-// CHECK: └─statement~simple-declaration := <ambiguous>
-// CHECK-NEXT: ├─simple-declaration := decl-specifier-seq ;
-// CHECK: └─simple-declaration := decl-specifier-seq init-declarator-list ;
-}
diff --git a/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp b/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp
deleted file mode 100644
index 1426f4e0a9bc05..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void foo2(int, ...);
-// CHECK: translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~VOID :=
-// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator parameters-and-qualifiers
-// CHECK-NEXT: │ ├─noptr-declarator~IDENTIFIER :=
-// CHECK-NEXT: │ └─parameters-and-qualifiers := ( parameter-declaration-clause [recover=Brackets] )
-// CHECK-NEXT: │ ├─( :=
-// CHECK-NEXT: │ ├─parameter-declaration-clause := parameter-declaration-list , ...
-// CHECK-NEXT: │ │ ├─parameter-declaration-list~INT :=
-// CHECK-NEXT: │ │ ├─, :=
-// CHECK-NEXT: │ │ └─... :=
-// CHECK-NEXT: │ └─) :=
-// CHECK-NEXT: └─; :=
diff --git a/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp b/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp
deleted file mode 100644
index 5d48a3a43d0270..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp
+++ /dev/null
@@ -1,5 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void s() {
- __func__;
- // CHECK: expression~__FUNC__ := tok[5]
-}
diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp
deleted file mode 100644
index 0b41f881fa3bf8..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void foo(complete garbage???) {}
-// CHECK: translation-unit~function-definition := decl-specifier-seq function-declarator function-body
-// CHECK-NEXT: ├─decl-specifier-seq~VOID := tok[0]
-// CHECK-NEXT: ├─function-declarator~noptr-declarator := noptr-declarator parameters-and-qualifiers
-// CHECK-NEXT: │ ├─noptr-declarator~IDENTIFIER := tok[1]
-// CHECK-NEXT: │ └─parameters-and-qualifiers := ( parameter-declaration-clause [recover=Brackets] )
-// CHECK-NEXT: │ ├─( := tok[2]
-// CHECK-NEXT: │ ├─parameter-declaration-clause := <opaque>
-// CHECK-NEXT: │ └─) := tok[8]
-// CHECK-NEXT: └─function-body~compound-statement := { }
-// CHECK-NEXT: ├─{ := tok[9]
-// CHECK-NEXT: └─} := tok[10]
diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
deleted file mode 100644
index 38216ad9647720..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-auto x = { complete garbage };
-// CHECK: translation-unit~simple-declaration
-// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
-// CHECK-NEXT: ├─init-declarator-list~init-declarator
-// CHECK-NEXT: │ ├─non-function-declarator~IDENTIFIER := tok[1]
-// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
-// CHECK-NEXT: │ ├─= := tok[2]
-// CHECK-NEXT: │ └─initializer-clause~braced-init-list
-// CHECK-NEXT: │ ├─{ := tok[3]
-// CHECK-NEXT: │ ├─initializer-list := <opaque>
-// CHECK-NEXT: │ └─} := tok[6]
-// CHECK-NEXT: └─; := tok[7]
diff --git a/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp b/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp
deleted file mode 100644
index 1c68e928ddd624..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s
-
-// Verify there is no false parse of the structured binding declaration.
-ABC[post] = abc;
-// CHECK: statement-seq~expression-statement := expression ;
-// CHECK: postfix-expression [ expr-or-braced-init-list ]
diff --git a/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp b/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp
deleted file mode 100644
index 02aff285f838c7..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-template <typename> struct MatchParents;
-// CHECK: template-parameter-list~TYPENAME := tok[2]
diff --git a/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp b/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp
deleted file mode 100644
index 1f7b106e0e93bc..00000000000000
--- a/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void s(int[]);
-// CHECK: parameter-declaration-clause~parameter-declaration := decl-specifier-seq abstract-declarator
-// CHECK-NEXT: ├─decl-specifier-seq~INT := tok[3]
-// CHECK-NEXT: └─abstract-declarator~noptr-abstract-declarator := [ ]
-// CHECK-NEXT: ├─[ := tok[4]
-// CHECK-NEXT: └─] := tok[5]
diff --git a/clang-tools-extra/pseudo/test/fuzzer.cpp b/clang-tools-extra/pseudo/test/fuzzer.cpp
deleted file mode 100644
index 400746a9d12d58..00000000000000
--- a/clang-tools-extra/pseudo/test/fuzzer.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: clang-pseudo-fuzzer -grammar=%cxx-bnf-file -print %s | FileCheck %s
-int x;
-// CHECK: translation-unit := declaration-seq
-// CHECK: builtin-type := INT
diff --git a/clang-tools-extra/pseudo/test/glr-variant-start.cpp b/clang-tools-extra/pseudo/test/glr-variant-start.cpp
deleted file mode 100644
index 1bd073707353b8..00000000000000
--- a/clang-tools-extra/pseudo/test/glr-variant-start.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s
-
-a + a;
-// CHECK: statement-seq~expression-statement := expression ;
-// CHECK-NEXT: ├─expression~additive-expression := additive-expression + multiplicative-expression
-// CHECK-NEXT: │ ├─additive-expression~IDENTIFIER :=
-// CHECK-NEXT: │ ├─+ :=
-// CHECK-NEXT: │ └─multiplicative-expression~IDENTIFIER :=
-// CHECK-NEXT: └─; :=
diff --git a/clang-tools-extra/pseudo/test/glr.cpp b/clang-tools-extra/pseudo/test/glr.cpp
deleted file mode 100644
index f805e42ffa6ddb..00000000000000
--- a/clang-tools-extra/pseudo/test/glr.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest -print-statistics | FileCheck %s
-
-void foo() {
- T* a; // a multiply expression or a pointer declaration?
-// CHECK: statement-seq~statement := <ambiguous>
-// CHECK-NEXT: ├─statement~expression-statement := expression ;
-// CHECK-NEXT: │ ├─expression~multiplicative-expression := multiplicative-expression * pm-expression
-// CHECK-NEXT: │ │ ├─multiplicative-expression~IDENTIFIER := tok[5]
-// CHECK-NEXT: │ │ ├─* := tok[6]
-// CHECK-NEXT: │ │ └─pm-expression~id-expression := unqualified-id #1
-// CHECK-NEXT: │ │ └─unqualified-id~IDENTIFIER := tok[7]
-// CHECK-NEXT: │ └─; := tok[8]
-// CHECK-NEXT: └─statement~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
-// CHECK-NEXT: │ ├─simple-type-specifier~IDENTIFIER := tok[5]
-// CHECK-NEXT: │ └─simple-type-specifier~IDENTIFIER := tok[5]
-// CHECK-NEXT: ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator
-// CHECK-NEXT: │ ├─ptr-operator~* := tok[6]
-// CHECK-NEXT: │ └─ptr-declarator~id-expression =#1
-// CHECK-NEXT: └─; := tok[8]
-}
-
-// CHECK: 2 Ambiguous nodes:
-// CHECK-NEXT: 1 simple-type-specifier
-// CHECK-NEXT: 1 statement
-// CHECK-EMPTY:
-// CHECK-NEXT: 0 Opaque nodes:
-// CHECK-EMPTY:
-// CHECK-NEXT: Ambiguity: 0.20 misparses/token
-// CHECK-NEXT: Unparsed: 0.00%
diff --git a/clang-tools-extra/pseudo/test/html-forest.c b/clang-tools-extra/pseudo/test/html-forest.c
deleted file mode 100644
index 0be08da49f4a71..00000000000000
--- a/clang-tools-extra/pseudo/test/html-forest.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// RUN: clang-pseudo -source %s -html-forest=%t.html
-// RUN: FileCheck %s < %t.html
-int main() {
-}
-// Sanity check for some obvious strings.
-// CHECK-DAG: <body>
-// CHECK-DAG: "compound-statement"
-// CHECK-DAG: main
diff --git a/clang-tools-extra/pseudo/test/lex.c b/clang-tools-extra/pseudo/test/lex.c
deleted file mode 100644
index ebebd2e0fb72ff..00000000000000
--- a/clang-tools-extra/pseudo/test/lex.c
+++ /dev/null
@@ -1,42 +0,0 @@
-int is_debug() {
-#ifndef NDEBUG
- return 1; // in debug mode
-#else
- return 0;
-#endif
-}
-
-/* This comment gets lexed along with the input above! We just don't CHECK it.
-
-RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
- SOURCE: int is_debug() {
-SOURCE-NEXT: #ifndef NDEBUG
-SOURCE-NEXT: return 1; // in debug mode
-SOURCE-NEXT: #else
-SOURCE-NEXT: return 0;
-SOURCE-NEXT: #end
-SOURCE-NEXT: }
-
-RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
- TOKEN: 0: raw_identifier 0:0 "int" flags=1
-TOKEN-NEXT: raw_identifier 0:0 "is_debug"
-TOKEN-NEXT: l_paren 0:0 "("
-TOKEN-NEXT: r_paren 0:0 ")"
-TOKEN-NEXT: l_brace 0:0 "{"
-TOKEN-NEXT: hash 1:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 1:0 "ifndef"
-TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
-TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 2:2 "1"
-TOKEN-NEXT: semi 2:2 ";"
-TOKEN-NEXT: comment 2:2 "// in debug mode"
-TOKEN-NEXT: hash 3:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 3:0 "else"
-TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 4:2 "0"
-TOKEN-NEXT: semi 4:2 ";"
-TOKEN-NEXT: hash 5:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 5:0 "endif"
-TOKEN-NEXT: r_brace 6:0 "}" flags=1
-
-*******************************************************************************/
diff --git a/clang-tools-extra/pseudo/test/lit.cfg.py b/clang-tools-extra/pseudo/test/lit.cfg.py
deleted file mode 100644
index 2ba1558b2ed7d5..00000000000000
--- a/clang-tools-extra/pseudo/test/lit.cfg.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import lit.llvm
-
-lit.llvm.initialize(lit_config, config)
-lit.llvm.llvm_config.use_default_substitutions()
-
-config.name = "ClangPseudo"
-config.suffixes = [".test", ".c", ".cpp"]
-config.excludes = ["Inputs"]
-config.test_format = lit.formats.ShTest(not lit.llvm.llvm_config.use_lit_shell)
-config.test_source_root = config.clang_pseudo_source_dir + "/test"
-config.test_exec_root = config.clang_pseudo_binary_dir + "/test"
-
-config.environment["PATH"] = os.path.pathsep.join(
- (config.clang_tools_dir, config.llvm_tools_dir, config.environment["PATH"])
-)
-
-# It is not realistically possible to account for all options that could
-# possibly be present in system and user configuration files, so disable
-# default configs for the test runs.
-config.environment["CLANG_NO_DEFAULT_CONFIG"] = "1"
diff --git a/clang-tools-extra/pseudo/test/lit.local.cfg b/clang-tools-extra/pseudo/test/lit.local.cfg
deleted file mode 100644
index 53079a0b538aeb..00000000000000
--- a/clang-tools-extra/pseudo/test/lit.local.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-cxx_bnf_file = os.path.join(config.clang_pseudo_source_dir, "lib", "cxx", "cxx.bnf")
-config.substitutions.append(("%cxx-bnf-file", cxx_bnf_file))
diff --git a/clang-tools-extra/pseudo/test/lit.site.cfg.py.in b/clang-tools-extra/pseudo/test/lit.site.cfg.py.in
deleted file mode 100644
index 3a969381ca613d..00000000000000
--- a/clang-tools-extra/pseudo/test/lit.site.cfg.py.in
+++ /dev/null
@@ -1,14 +0,0 @@
-@LIT_SITE_CFG_IN_HEADER@
-
-# Variables needed for common llvm config.
-config.clang_tools_dir = "@CURRENT_TOOLS_DIR@"
-config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
-config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
-config.llvm_libs_dir = lit_config.substitute("@LLVM_LIBS_DIR@")
-config.target_triple = "@TARGET_TRIPLE@"
-config.python_executable = "@Python3_EXECUTABLE@"
-
-config.clang_pseudo_source_dir = "@CMAKE_CURRENT_SOURCE_DIR@/.."
-config.clang_pseudo_binary_dir = "@CMAKE_CURRENT_BINARY_DIR@/.."
-# Delegate logic to lit.cfg.py.
-lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")
diff --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test
deleted file mode 100644
index 13036349eb8c1a..00000000000000
--- a/clang-tools-extra/pseudo/test/lr-build-basic.test
+++ /dev/null
@@ -1,32 +0,0 @@
-_ := expr EOF
-expr := id
-id := IDENTIFIER
-
-# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
-# GRAPH: States:
-# GRAPH-NEXT: State 0
-# GRAPH-NEXT: _ := • expr EOF
-# GRAPH-NEXT: expr := • id
-# GRAPH-NEXT: id := • IDENTIFIER
-# GRAPH-NEXT: State 1
-# GRAPH-NEXT: _ := expr • EOF
-# GRAPH-NEXT: State 2
-# GRAPH-NEXT: expr := id •
-# GRAPH-NEXT: State 3
-# GRAPH-NEXT: id := IDENTIFIER •
-# GRAPH-NEXT: State 4
-# GRAPH-NEXT: _ := expr EOF •
-
-# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
-# TABLE: LRTable:
-# TABLE-NEXT: State 0
-# TABLE-NEXT: IDENTIFIER: shift state 3
-# TABLE-NEXT: expr: go to state 1
-# TABLE-NEXT: id: go to state 2
-# TABLE-NEXT: State 1
-# TABLE-NEXT: EOF: shift state 4
-# TABLE-NEXT: State 2
-# TABLE-NEXT: EOF: reduce by rule 2 'expr := id'
-# TABLE-NEXT: State 3
-# TABLE-NEXT: EOF: reduce by rule 1 'id := IDENTIFIER'
-# TABLE-NEXT: State 4
diff --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test
deleted file mode 100644
index a66ce4d622ca1c..00000000000000
--- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ /dev/null
@@ -1,49 +0,0 @@
-_ := expr EOF
-expr := expr - expr # S/R conflict at state 4 on '-' token
-expr := IDENTIFIER
-
-# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
-# GRAPH: States
-# GRAPH-NEXT: State 0
-# GRAPH-NEXT: _ := • expr EOF
-# GRAPH-NEXT: expr := • expr - expr
-# GRAPH-NEXT: expr := • IDENTIFIER
-# GRAPH-NEXT: State 1
-# GRAPH-NEXT: _ := expr • EOF
-# GRAPH-NEXT: expr := expr • - expr
-# GRAPH-NEXT: State 2
-# GRAPH-NEXT: expr := IDENTIFIER •
-# GRAPH-NEXT: State 3
-# GRAPH-NEXT: _ := expr EOF •
-# GRAPH-NEXT: State 4
-# GRAPH-NEXT: expr := • expr - expr
-# GRAPH-NEXT: expr := expr - • expr
-# GRAPH-NEXT: expr := • IDENTIFIER
-# GRAPH-NEXT: State 5
-# GRAPH-NEXT: expr := expr - expr •
-# GRAPH-NEXT: expr := expr • - expr
-# GRAPH-NEXT: 0 ->[expr] 1
-# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
-# GRAPH-NEXT: 1 ->[EOF] 3
-# GRAPH-NEXT: 1 ->[-] 4
-# GRAPH-NEXT: 4 ->[expr] 5
-# GRAPH-NEXT: 4 ->[IDENTIFIER] 2
-# GRAPH-NEXT: 5 ->[-] 4
-
-# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
-# TABLE: LRTable:
-# TABLE-NEXT: State 0
-# TABLE-NEXT: IDENTIFIER: shift state 2
-# TABLE-NEXT: expr: go to state 1
-# TABLE-NEXT: State 1
-# TABLE-NEXT: EOF: shift state 3
-# TABLE-NEXT: -: shift state 4
-# TABLE-NEXT: State 2
-# TABLE-NEXT: EOF -: reduce by rule 2 'expr := IDENTIFIER'
-# TABLE-NEXT: State 3
-# TABLE-NEXT: State 4
-# TABLE-NEXT: IDENTIFIER: shift state 2
-# TABLE-NEXT: expr: go to state 5
-# TABLE-NEXT: State 5
-# TABLE-NEXT: -: shift state 4
-# TABLE-NEXT: EOF -: reduce by rule 1 'expr := expr - expr'
diff --git a/clang-tools-extra/pseudo/test/strip-directives.c b/clang-tools-extra/pseudo/test/strip-directives.c
deleted file mode 100644
index c7878d9295a08a..00000000000000
--- a/clang-tools-extra/pseudo/test/strip-directives.c
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <stdio.h>
-int main() {
-#error This was inevitable...
-#if HELLO
- printf("hello, world\n");
- return 0;
-#else
- abort();
-#endif
-}
-
-/* This comment gets lexed along with the input above! We just don't CHECK it.
-
-RUN: clang-pseudo -source %s -print-directive-tree | FileCheck %s -check-prefix=PPT --strict-whitespace
- PPT: #include (7 tokens)
-PPT-NEXT: code (5 tokens)
-PPT-NEXT: #error (6 tokens)
-PPT-NEXT: #if (3 tokens) TAKEN
-PPT-NEXT: code (8 tokens)
-PPT-NEXT: #else (2 tokens)
-PPT-NEXT: code (4 tokens)
-PPT-NEXT: #endif (2 tokens)
-PPT-NEXT: code (2 tokens)
- ^ including this block comment
-
-RUN: clang-pseudo -source %s -strip-directives -print-source | FileCheck %s --strict-whitespace
- CHECK: int main() {
-CHECK-NEXT: printf("hello, world\n");
-CHECK-NEXT: return 0;
-CHECK-NEXT: }
-
-RUN: clang-pseudo -source %s -strip-directives -print-tokens | FileCheck %s --check-prefix=TOKEN
- TOKEN: 0: raw_identifier 1:0 "int" flags=1
-TOKEN-NEXT: raw_identifier 1:0 "main"
-TOKEN-NEXT: l_paren 1:0 "("
-TOKEN-NEXT: r_paren 1:0 ")"
-TOKEN-NEXT: l_brace 1:0 "{"
-TOKEN-NEXT: raw_identifier 4:2 "printf" flags=1
-TOKEN-NEXT: l_paren 4:2 "("
-TOKEN-NEXT: string_literal 4:2 "\22hello, world\\n\22"
-TOKEN-NEXT: r_paren 4:2 ")"
-TOKEN-NEXT: semi 4:2 ";"
-TOKEN-NEXT: raw_identifier 5:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 5:2 "0"
-TOKEN-NEXT: semi 5:2 ";"
-TOKEN-NEXT: r_brace 9:0 "}" flags=1
-
-*******************************************************************************/
-
diff --git a/clang-tools-extra/pseudo/tool/CMakeLists.txt b/clang-tools-extra/pseudo/tool/CMakeLists.txt
deleted file mode 100644
index bead383228396e..00000000000000
--- a/clang-tools-extra/pseudo/tool/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-set(LLVM_LINK_COMPONENTS support)
-
-add_clang_tool(clang-pseudo
- ClangPseudo.cpp
- HTMLForest.cpp
- )
-
-clang_target_link_libraries(clang-pseudo
- PRIVATE
- clangBasic
- )
-
-target_link_libraries(clang-pseudo
- PRIVATE
- clangPseudo
- clangPseudoGrammar
- clangPseudoCLI
- )
-
-add_custom_command(OUTPUT HTMLForestResources.inc
- COMMAND "${Python3_EXECUTABLE}" ${CLANG_SOURCE_DIR}/utils/bundle_resources.py
- ${CMAKE_CURRENT_BINARY_DIR}/HTMLForestResources.inc
- HTMLForest.css HTMLForest.js HTMLForest.html
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMENT "Bundling HTMLForest resources"
- DEPENDS ${CLANG_SOURCE_DIR}/utils/bundle_resources.py HTMLForest.css HTMLForest.js HTMLForest.html
- VERBATIM)
-add_custom_target(clang-pseudo-resources DEPENDS HTMLForestResources.inc)
-set_target_properties(clang-pseudo-resources PROPERTIES FOLDER "Clang Tools Extra/Resources")
-add_dependencies(clang-pseudo clang-pseudo-resources)
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
deleted file mode 100644
index 6a64760749cefe..00000000000000
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-//===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/DirectiveTree.h"
-#include "clang-pseudo/Disambiguate.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRGraph.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/STLFunctionalExtras.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Signals.h"
-#include <optional>
-
-using clang::pseudo::ForestNode;
-using clang::pseudo::Token;
-using clang::pseudo::TokenStream;
-using llvm::cl::desc;
-using llvm::cl::init;
-using llvm::cl::opt;
-
-static opt<bool> PrintGrammar("print-grammar", desc("Print the grammar"));
-static opt<bool> PrintGraph("print-graph",
- desc("Print the LR graph for the grammar"));
-static opt<bool> PrintTable("print-table",
- desc("Print the LR table for the grammar"));
-static opt<std::string> Source("source", desc("Source file"));
-static opt<bool> PrintSource("print-source", desc("Print token stream"));
-static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
-static opt<bool>
- PrintDirectiveTree("print-directive-tree",
- desc("Print directive structure of source code"));
-static opt<bool>
- StripDirectives("strip-directives",
- desc("Strip directives and select conditional sections"));
-static opt<bool> Disambiguate("disambiguate",
- desc("Choose best tree from parse forest"));
-static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
-static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
-static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
- init(true));
-static opt<std::string> HTMLForest("html-forest",
- desc("output file for HTML forest"));
-static opt<std::string> StartSymbol("start-symbol",
- desc("Specify the start symbol to parse"),
- init("translation-unit"));
-
-static std::string readOrDie(llvm::StringRef Path) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
- llvm::MemoryBuffer::getFile(Path);
- if (std::error_code EC = Text.getError()) {
- llvm::errs() << "Error: can't read file '" << Path
- << "': " << EC.message() << "\n";
- ::exit(1);
- }
- return Text.get()->getBuffer().str();
-}
-
-namespace clang {
-namespace pseudo {
-// Defined in HTMLForest.cpp
-void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &,
- const ForestNode &Root, const Disambiguation &,
- const TokenStream &);
-namespace {
-
-struct NodeStats {
- unsigned Total = 0;
- std::vector<std::pair<SymbolID, unsigned>> BySymbol;
-
- NodeStats(const ForestNode &Root,
- llvm::function_ref<bool(const ForestNode &)> Filter) {
- llvm::DenseMap<SymbolID, unsigned> Map;
- for (const ForestNode &N : Root.descendants())
- if (Filter(N)) {
- ++Total;
- ++Map[N.symbol()];
- }
- BySymbol = {Map.begin(), Map.end()};
- // Sort by count descending, then symbol ascending.
- llvm::sort(BySymbol, [](const auto &L, const auto &R) {
- return std::tie(R.second, L.first) < std::tie(L.second, R.first);
- });
- }
-};
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
-
-int main(int argc, char *argv[]) {
- llvm::cl::ParseCommandLineOptions(argc, argv, "");
- llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
-
- clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
- std::string SourceText;
- std::optional<clang::pseudo::TokenStream> RawStream;
- std::optional<TokenStream> PreprocessedStream;
- std::optional<clang::pseudo::TokenStream> ParseableStream;
- if (Source.getNumOccurrences()) {
- SourceText = readOrDie(Source);
- RawStream = clang::pseudo::lex(SourceText, LangOpts);
- TokenStream *Stream = &*RawStream;
-
- auto DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream);
- clang::pseudo::chooseConditionalBranches(DirectiveStructure, *RawStream);
-
- std::optional<TokenStream> Preprocessed;
- if (StripDirectives) {
- Preprocessed = DirectiveStructure.stripDirectives(*Stream);
- Stream = &*Preprocessed;
- }
-
- if (PrintSource)
- Stream->print(llvm::outs());
- if (PrintTokens)
- llvm::outs() << *Stream;
- if (PrintDirectiveTree)
- llvm::outs() << DirectiveStructure;
-
- ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
- pairBrackets(*ParseableStream);
- }
-
- const auto &Lang = clang::pseudo::getLanguageFromFlags();
- if (PrintGrammar)
- llvm::outs() << Lang.G.dump();
- if (PrintGraph)
- llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests(
- Lang.G);
-
- if (PrintTable)
- llvm::outs() << Lang.Table.dumpForTests(Lang.G);
- if (PrintStatistics)
- llvm::outs() << Lang.Table.dumpStatistics();
-
- if (ParseableStream) {
- clang::pseudo::ForestArena Arena;
- clang::pseudo::GSS GSS;
- std::optional<clang::pseudo::SymbolID> StartSymID =
- Lang.G.findNonterminal(StartSymbol);
- if (!StartSymID) {
- llvm::errs() << llvm::formatv(
- "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol);
- return 2;
- }
- auto &Root =
- glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
- *StartSymID, Lang);
- // If we're disambiguating, we'll print at the end instead.
- if (PrintForest && !Disambiguate)
- llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
- clang::pseudo::Disambiguation Disambig;
- if (Disambiguate)
- Disambig = clang::pseudo::disambiguate(&Root, {});
-
- if (HTMLForest.getNumOccurrences()) {
- std::error_code EC;
- llvm::raw_fd_ostream HTMLOut(HTMLForest, EC);
- if (EC) {
- llvm::errs() << "Couldn't write " << HTMLForest << ": " << EC.message()
- << "\n";
- return 2;
- }
- clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig,
- *ParseableStream);
- }
-
- if (PrintStatistics) {
- llvm::outs() << "Forest bytes: " << Arena.bytes()
- << " nodes: " << Arena.nodeCount() << "\n";
- llvm::outs() << "GSS bytes: " << GSS.bytes()
- << " nodes: " << GSS.nodesCreated() << "\n";
-
- for (auto &P : {std::make_pair("Ambiguous", ForestNode::Ambiguous),
- std::make_pair("Opaque", ForestNode::Opaque)}) {
- clang::pseudo::NodeStats Stats(
- Root, [&](const auto &N) { return N.kind() == P.second; });
- llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n";
- for (const auto &S : Stats.BySymbol)
- llvm::outs() << llvm::formatv(" {0,3} {1}\n", S.second,
- Lang.G.symbolName(S.first));
- }
-
- // Metrics for how imprecise parsing was.
- // These are rough but aim to be:
- // - linear: if we eliminate half the errors the metric should halve
- // - length-independent
- unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique)
- unsigned Misparses = 0; // Sum of alternatives-1
- llvm::DenseSet<const ForestNode *> Visited;
- auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void {
- if (N.kind() == ForestNode::Opaque) {
- UnparsedTokens += End - N.startTokenIndex();
- } else if (N.kind() == ForestNode::Ambiguous) {
- Misparses += N.alternatives().size() - 1;
- for (const auto *C : N.alternatives())
- if (Visited.insert(C).second)
- DFS(*C, End, DFS);
- } else if (N.kind() == ForestNode::Sequence) {
- for (unsigned I = 0, E = N.children().size(); I < E; ++I)
- if (Visited.insert(N.children()[I]).second)
- DFS(*N.children()[I],
- I + 1 == N.children().size()
- ? End
- : N.children()[I + 1]->startTokenIndex(),
- DFS);
- }
- };
- unsigned Len = ParseableStream->tokens().size();
- DFS(Root, Len, DFS);
- llvm::outs() << "\n";
- llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n",
- double(Misparses) / Len);
- llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
- 100.0 * UnparsedTokens / Len);
- }
-
- if (Disambiguate && PrintForest) {
- ForestNode *DisambigRoot = &Root;
- removeAmbiguities(DisambigRoot, Disambig);
- llvm::outs() << "Disambiguated tree:\n";
- llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
- /*Abbreviated=*/ForestAbbrev);
- }
- }
-
- return 0;
-}
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.cpp b/clang-tools-extra/pseudo/tool/HTMLForest.cpp
deleted file mode 100644
index 184430bddd8d64..00000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-//===-- HTMLForest.cpp - browser-based parse forest explorer
-//---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The plain text forest node dump (clang-pseudo -print-forest) is useful but
-// hard to reconcile with the code being examined, especially when it is large.
-//
-// HTMLForest produces a self-contained HTML file containing both the code and
-// the forest representation, linking them interactively with javascript.
-// At any given time, a single parse tree is shown (ambiguities resolved).
-// The user can switch between ambiguous alternatives.
-//
-// +-------+---------------+
-// | | +-----+|
-// | #tree | #code |#info||
-// | | +-----+|
-// | | |
-// +-------+---------------+
-//
-// #tree is a hierarchical view of the nodes (nested <ul>s), like -print-forest.
-// (It is a simple tree, not a DAG, because ambiguities have been resolved).
-// Like -print-forest, trivial sequences are collapsed (expression~IDENTIFIER).
-//
-// #code is the source code, annotated with <span>s marking the node ranges.
-// These spans are usually invisible (exception: ambiguities are marked), but
-// they are used to show and change the selection.
-//
-// #info is a floating box that shows details of the currently selected node:
-// - rule (for sequence nodes). Abbreviated rules are also shown.
-// - alternatives (for ambiguous nodes). The user can choose an alternative.
-// - ancestors. The parent nodes show how this node fits in translation-unit.
-//
-// There are two types of 'active' node:
-// - *highlight* is what the cursor is over, and is colored blue.
-// Near ancestors are shaded faintly (onion-skin) to show local structure.
-// - *selection* is set by clicking.
-// The #info box shows the selection, and selected nodes have a dashed ring.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Disambiguate.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/JSON.h"
-#include "llvm/Support/raw_ostream.h"
-namespace clang {
-namespace pseudo {
-namespace {
-
-// Defines const char HTMLForest_css[] = "...contents of HTMLForest.css..."; etc
-#include "HTMLForestResources.inc"
-
-struct Writer {
- llvm::raw_ostream &Out;
- const Grammar &G;
- const ForestNode &Root;
- const TokenStream &Stream;
- const Disambiguation &Disambig;
-
- void write() {
- Out << "<!doctype html>\n";
- tag("html", [&] {
- tag("head", [&] {
- tag("title", [&] { Out << "HTMLForest"; });
- tag("script", [&] { Out << HTMLForest_js; });
- tag("style", [&] { Out << HTMLForest_css; });
- tag("script", [&] {
- Out << "var forest=";
- writeForestJSON();
- Out << ";";
- });
- tag("pre id='hidden-code' hidden", [&] { writeCode(); });
- });
- tag("body", [&] { Out << HTMLForest_html; });
- });
- }
-
- void writeCode();
- void writeForestJSON();
- void tag(llvm::StringRef Opener, llvm::function_ref<void()> Body) {
- Out << "<" << Opener << ">";
- Body();
- Out << "</" << Opener.split(' ').first << ">\n";
- }
-};
-
-void Writer::writeCode() {
- // This loop (whitespace logic) is cribbed from TokenStream::Print.
- bool FirstToken = true;
- unsigned LastLine = -1;
- StringRef LastText;
- for (const auto &T : Stream.tokens()) {
- StringRef Text = T.text();
- if (FirstToken) {
- FirstToken = false;
- } else if (T.Line == LastLine) {
- if (LastText.data() + LastText.size() != Text.data())
- Out << ' ';
- } else {
- Out << " \n"; // Extra space aids selection.
- Out.indent(T.Indent);
- }
- Out << "<span class='token' id='t" << Stream.index(T) << "'>";
- llvm::printHTMLEscaped(Text, Out);
- Out << "</span>";
- LastLine = T.Line;
- LastText = Text;
- }
- if (!FirstToken)
- Out << '\n';
-}
-
-// Writes a JSON array of forest nodes. Items are e.g.:
-// {kind:'sequence', symbol:'compound-stmt', children:[5,8,33],
-// rule:'compound-stmt := ...'} {kind:'terminal', symbol:'VOID', token:'t52'}
-// {kind:'ambiguous', symbol:'type-specifier', children:[3,100] selected:3}
-// {kind:'opaque', symbol:'statement-seq', firstToken:'t5', lastToken:'t6'}
-void Writer::writeForestJSON() {
- // This is the flat array of nodes: the index into this array is the node ID.
- std::vector<std::pair<const ForestNode *, /*End*/ Token::Index>> Sequence;
- llvm::DenseMap<const ForestNode *, unsigned> Index;
- auto AssignID = [&](const ForestNode *N, Token::Index End) -> unsigned {
- auto R = Index.try_emplace(N, Sequence.size());
- if (R.second)
- Sequence.push_back({N, End});
- return R.first->second;
- };
- AssignID(&Root, Stream.tokens().size());
- auto TokenID = [](Token::Index I) { return ("t" + llvm::Twine(I)).str(); };
-
- llvm::json::OStream Out(this->Out, 2);
- Out.array([&] {
- for (unsigned I = 0; I < Sequence.size(); ++I) {
- const ForestNode *N = Sequence[I].first;
- Token::Index End = Sequence[I].second;
- Out.object([&] {
- Out.attribute("symbol", G.symbolName(N->symbol()));
- switch (N->kind()) {
- case ForestNode::Terminal:
- Out.attribute("kind", "terminal");
- Out.attribute("token", TokenID(N->startTokenIndex()));
- break;
- case ForestNode::Sequence:
- Out.attribute("kind", "sequence");
- Out.attribute("rule", G.dumpRule(N->rule()));
- break;
- case ForestNode::Ambiguous:
- Out.attribute("kind", "ambiguous");
- Out.attribute("selected",
- AssignID(N->children()[Disambig.lookup(N)], End));
- break;
- case ForestNode::Opaque:
- Out.attribute("kind", "opaque");
- Out.attribute("firstToken", TokenID(N->startTokenIndex()));
- // [firstToken, lastToken] is a closed range.
- // If empty, lastToken is omitted.
- if (N->startTokenIndex() != End)
- Out.attribute("lastToken", TokenID(End - 1));
- break;
- }
- auto Children = N->children();
- if (!Children.empty())
- Out.attributeArray("children", [&] {
- for (unsigned I = 0; I < Children.size(); ++I)
- Out.value(AssignID(Children[I],
- I + 1 == Children.size()
- ? End
- : Children[I + 1]->startTokenIndex()));
- });
- });
- }
- });
-}
-
-} // namespace
-
-// We only accept the derived stream here.
-// FIXME: allow the original stream instead?
-void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &G,
- const ForestNode &Root, const Disambiguation &Disambig,
- const TokenStream &Stream) {
- Writer{OS, G, Root, Stream, Disambig}.write();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.css b/clang-tools-extra/pseudo/tool/HTMLForest.css
deleted file mode 100644
index 674cd59f0e76b3..00000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.css
+++ /dev/null
@@ -1,93 +0,0 @@
-body {
- position: absolute;
- top: 0;
- bottom: 0;
- right: 0;
- left: 0;
-
- display: flex;
- align-items: stretch;
- margin: 0;
- font-family: sans-serif;
- white-space: nowrap;
- height: 100%;
-}
-body > * {
- overflow-y: auto; /* Scroll sections independently*/
- margin: 0;
-}
-
-#code {
- font-size: 18px;
- line-height: 36px;
- flex-grow: 1;
- padding-right: 10em; /* Leave space for #info */
-}
-#code span {
- padding: 9px 0; /* No "gaps" between lines due to line-height */
-}
-.node.ambiguous::before, .ancestors.ambiguous::after, .tree-node.ambiguous > header::after {
- content: /*the thinking man's emoji*/'\01F914';
-}
-
-#info {
- position: fixed;
- right: 2em;
- top: 1em;
- width: 25em;
- border: 1px solid black;
- min-height: 20em;
- background-color: whiteSmoke;
- overflow-x: clip;
- box-shadow: 3px 3px 5px rgba(0,0,0,0.2);
-}
-#info header {
- background-color: black;
- color: white;
- font-size: larger;
- padding: 0.5em;
-}
-#info.ambiguous header { background-color: #803; }
-#info.sequence header { background-color: darkBlue; }
-#info.terminal header { background-color: darkGreen; }
-#info.opaque header { background-color: orangeRed; }
-#i_kind {
- float: right;
- font-size: small;
-}
-#info section {
- padding: 0.5em;
- border-top: 1px solid lightGray;
- overflow-x: auto;
-}
-#i_ancestors { font-size: small; }
-
-#tree {
- flex-grow: 0;
- min-width: 20em;
- margin-right: 1em;
- border-right: 1px solid darkGray;
- background-color: azure;
- font-size: small;
- overflow-x: auto;
- resize: horizontal;
-}
-#tree ul {
- margin: 0;
- display: inline-block;
- padding-left: 6px;
- border-left: 1px solid rgba(0,0,0,0.2);
- list-style: none;
-}
-#tree > ul { border-left: none; }
-.tree-node.selected > header .name { font-weight: bold; }
-.tree-node.terminal .name { font-family: monospace; }
-.tree-node.ambiguous > header .name { color: #803; font-weight: bold; }
-.tree-node.sequence > header .name { color: darkBlue; }
-.tree-node.terminal > header .name { color: darkGreen; }
-.tree-node.opaque > header .name { color: orangeRed; }
-
-.selected { outline: 1px dashed black; }
-.abbrev { opacity: 50%; }
-.abbrev::after { content: '~'; }
-.opaque { background-color: bisque; }
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.html b/clang-tools-extra/pseudo/tool/HTMLForest.html
deleted file mode 100644
index 4cf98cbbb2cc93..00000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.html
+++ /dev/null
@@ -1,15 +0,0 @@
-<div id="tree"><ul></ul></div>
-<pre id="code"></pre>
-<div id="info" hidden>
- <header>
- <span id="i_symbol"></span>
- <span id="i_kind"></span>
- </header>
- <section>
- <div id="i_rules"></div>
- <div id="i_alternatives"></div>
- </section>
- <section>
- <div id="i_ancestors"></div>
- </section>
-</div>
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.js b/clang-tools-extra/pseudo/tool/HTMLForest.js
deleted file mode 100644
index 24b88a5c10b471..00000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.js
+++ /dev/null
@@ -1,290 +0,0 @@
-// The global map of forest node index => NodeView.
-views = [];
-// NodeView is a visible forest node.
-// It has an entry in the navigation tree, and a span in the code itself.
-// Each NodeView is associated with a forest node, but not all nodes have views:
-// - nodes not reachable though current ambiguity selection
-// - trivial "wrapping" sequence nodes are abbreviated away
-class NodeView {
- // Builds a node representing forest[index], or its target if it is a wrapper.
- // Registers the node in the global map.
- static make(index, parent, abbrev) {
- var node = forest[index];
- if (node.kind == 'sequence' && node.children.length == 1 &&
- forest[node.children[0]].kind != 'ambiguous') {
- abbrev ||= [];
- abbrev.push(index);
- return NodeView.make(node.children[0], parent, abbrev);
- }
- return views[index] = new NodeView(index, parent, node, abbrev);
- }
-
- constructor(index, parent, node, abbrev) {
- this.abbrev = abbrev || [];
- this.parent = parent;
- this.children =
- (node.kind == 'ambiguous' ? [ node.selected ] : node.children || [])
- .map((c) => NodeView.make(c, this));
- this.index = index;
- this.node = node;
- views[index] = this;
-
- this.span = this.buildSpan();
- this.tree = this.buildTree();
- }
-
- // Replaces the token sequence in #code with a <span class=node>.
- buildSpan() {
- var elt = document.createElement('span');
- elt.dataset['index'] = this.index;
- elt.classList.add("node");
- elt.classList.add("selectable-node");
- elt.classList.add(this.node.kind);
-
- var begin = null, end = null;
- if (this.children.length != 0) {
- begin = this.children[0].span;
- end = this.children[this.children.length - 1].span.nextSibling;
- } else if (this.node.kind == 'terminal') {
- begin = document.getElementById(this.node.token);
- end = begin.nextSibling;
- } else if (this.node.kind == 'opaque') {
- begin = document.getElementById(this.node.firstToken);
- end = (this.node.lastToken == null)
- ? begin
- : document.getElementById(this.node.lastToken).nextSibling;
- }
- var parent = begin.parentNode;
- splice(begin, end, elt);
- parent.insertBefore(elt, end);
- return elt;
- }
-
- // Returns a (detached) <li class=tree-node> suitable for use in #tree.
- buildTree() {
- var elt = document.createElement('li');
- elt.dataset['index'] = this.index;
- elt.classList.add('tree-node');
- elt.classList.add('selectable-node');
- elt.classList.add(this.node.kind);
- var header = document.createElement('header');
- elt.appendChild(header);
-
- if (this.abbrev.length > 0) {
- var abbrev = document.createElement('span');
- abbrev.classList.add('abbrev');
- abbrev.innerText = forest[this.abbrev[0]].symbol;
- header.appendChild(abbrev);
- }
- var name = document.createElement('span');
- name.classList.add('name');
- name.innerText = this.node.symbol;
- header.appendChild(name);
-
- if (this.children.length != 0) {
- var sublist = document.createElement('ul');
- this.children.forEach((c) => sublist.appendChild(c.tree));
- elt.appendChild(sublist);
- }
- return elt;
- }
-
- // Make this view visible on the screen by scrolling if needed.
- scrollVisible() {
- scrollIntoViewV(document.getElementById('tree'), this.tree.firstChild);
- scrollIntoViewV(document.getElementById('code'), this.span);
- }
-
- // Fill #info with details of this node.
- renderInfo() {
- document.getElementById('info').classList = this.node.kind;
- document.getElementById('i_symbol').innerText = this.node.symbol;
- document.getElementById('i_kind').innerText = this.node.kind;
-
- // For sequence nodes, add LHS := RHS rule.
- // If this node abbreviates trivial sequences, we want those rules too.
- var rules = document.getElementById('i_rules');
- rules.textContent = '';
- function addRule(i) {
- var ruleText = forest[i].rule;
- if (ruleText == null)
- return;
- var rule = document.createElement('div');
- rule.classList.add('rule');
- rule.innerText = ruleText;
- rules.insertBefore(rule, rules.firstChild);
- }
- this.abbrev.forEach(addRule);
- addRule(this.index);
-
- // For ambiguous nodes, show a selectable list of alternatives.
- var alternatives = document.getElementById('i_alternatives');
- alternatives.textContent = '';
- var that = this;
- function addAlternative(i) {
- var altNode = forest[i];
- var text = altNode.rule || altNode.kind;
- var alt = document.createElement('div');
- alt.classList.add('alternative');
- alt.innerText = text;
- alt.dataset['index'] = i;
- alt.dataset['parent'] = that.index;
- if (i == that.node.selected)
- alt.classList.add('selected');
- alternatives.appendChild(alt);
- }
- if (this.node.kind == 'ambiguous')
- this.node.children.forEach(addAlternative);
-
- // Show the stack of ancestor nodes.
- // The part of each rule that leads to the current node is bolded.
- var ancestors = document.getElementById('i_ancestors');
- ancestors.textContent = '';
- var child = this;
- for (var view = this.parent; view != null;
- child = view, view = view.parent) {
- var indexInParent = view.children.indexOf(child);
-
- var ctx = document.createElement('div');
- ctx.classList.add('ancestors');
- ctx.classList.add('selectable-node');
- ctx.classList.add(view.node.kind);
- if (view.node.rule) {
- // Rule syntax is LHS := RHS1 [annotation] RHS2.
- // We walk through the chunks and bold the one at parentInIndex.
- var chunkCount = 0;
- ctx.innerHTML = view.node.rule.replaceAll(/[^ ]+/g, function(match) {
- if (!(match.startsWith('[') && match.endsWith(']')) /*annotations*/
- && chunkCount++ == indexInParent + 2 /*skip LHS :=*/)
- return '<b>' + match + '</b>';
- return match;
- });
- } else /*ambiguous*/ {
- ctx.innerHTML = '<b>' + view.node.symbol + '</b>';
- }
- ctx.dataset['index'] = view.index;
- if (view.abbrev.length > 0) {
- var abbrev = document.createElement('span');
- abbrev.classList.add('abbrev');
- abbrev.innerText = forest[view.abbrev[0]].symbol;
- ctx.insertBefore(abbrev, ctx.firstChild);
- }
-
- ctx.dataset['index'] = view.index;
- ancestors.appendChild(ctx, ancestors.firstChild);
- }
- }
-
- remove() {
- this.children.forEach((c) => c.remove());
- splice(this.span.firstChild, null, this.span.parentNode,
- this.span.nextSibling);
- detach(this.span);
- delete views[this.index];
- }
-};
-
-var selection = null;
-function selectView(view) {
- var old = selection;
- selection = view;
- if (view == old)
- return;
-
- if (old) {
- old.tree.classList.remove('selected');
- old.span.classList.remove('selected');
- }
- document.getElementById('info').hidden = (view == null);
- if (!view)
- return;
- view.tree.classList.add('selected');
- view.span.classList.add('selected');
- view.renderInfo();
- view.scrollVisible();
-}
-
-// To highlight nodes on hover, we create dynamic CSS rules of the form
-// .selectable-node[data-index="42"] { background-color: blue; }
-// This avoids needing to find all the related nodes and update their classes.
-var highlightSheet = new CSSStyleSheet();
-document.adoptedStyleSheets.push(highlightSheet);
-function highlightView(view) {
- var text = '';
- for (const color of ['#6af', '#bbb', '#ddd', '#eee']) {
- if (view == null)
- break;
- text += '.selectable-node[data-index="' + view.index + '"] '
- text += '{ background-color: ' + color + '; }\n';
- view = view.parent;
- }
- highlightSheet.replace(text);
-}
-
-// Select which branch of an ambiguous node is taken.
-function chooseAlternative(parent, index) {
- var parentView = views[parent];
- parentView.node.selected = index;
- var oldChild = parentView.children[0];
- oldChild.remove();
- var newChild = NodeView.make(index, parentView);
- parentView.children[0] = newChild;
- parentView.tree.lastChild.replaceChild(newChild.tree, oldChild.tree);
-
- highlightView(null);
- // Force redraw of the info box.
- selectView(null);
- selectView(parentView);
-}
-
-// Attach event listeners and build content once the document is ready.
-document.addEventListener("DOMContentLoaded", function() {
- var code = document.getElementById('code');
- var tree = document.getElementById('tree');
- var ancestors = document.getElementById('i_ancestors');
- var alternatives = document.getElementById('i_alternatives');
-
- [code, tree, ancestors].forEach(function(container) {
- container.addEventListener('click', function(e) {
- var nodeElt = e.target.closest('.selectable-node');
- selectView(nodeElt && views[Number(nodeElt.dataset['index'])]);
- });
- container.addEventListener('mousemove', function(e) {
- var nodeElt = e.target.closest('.selectable-node');
- highlightView(nodeElt && views[Number(nodeElt.dataset['index'])]);
- });
- });
-
- alternatives.addEventListener('click', function(e) {
- var altElt = e.target.closest('.alternative');
- if (altElt)
- chooseAlternative(Number(altElt.dataset['parent']),
- Number(altElt.dataset['index']));
- });
-
- // The HTML provides #code content in a hidden DOM element, move it.
- var hiddenCode = document.getElementById('hidden-code');
- splice(hiddenCode.firstChild, hiddenCode.lastChild, code);
- detach(hiddenCode);
-
- // Build the tree of NodeViews and attach to #tree.
- tree.firstChild.appendChild(NodeView.make(0).tree);
-});
-
-// Helper DOM functions //
-
-// Moves the sibling range [first, until) into newParent.
-function splice(first, until, newParent, before) {
- for (var next = first; next != until;) {
- var elt = next;
- next = next.nextSibling;
- newParent.insertBefore(elt, before);
- }
-}
-function detach(node) { node.parentNode.removeChild(node); }
-// Like scrollIntoView, but vertical only!
-function scrollIntoViewV(container, elt) {
- if (container.scrollTop > elt.offsetTop + elt.offsetHeight ||
- container.scrollTop + container.clientHeight < elt.offsetTop)
- container.scrollTo({top : elt.offsetTop, behavior : 'smooth'});
-}
diff --git a/clang-tools-extra/pseudo/unittests/BracketTest.cpp b/clang-tools-extra/pseudo/unittests/BracketTest.cpp
deleted file mode 100644
index 2fbfc641513648..00000000000000
--- a/clang-tools-extra/pseudo/unittests/BracketTest.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===--- BracketTest.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/Token.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/Testing/Annotations/Annotations.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace pseudo {
-
-// Return a version of Code with each paired bracket marked with ^.
-std::string decorate(llvm::StringRef Code, const TokenStream &Stream) {
- std::string Result;
- const char *Pos = Code.data();
- for (const Token &Tok : Stream.tokens()) {
- if (Tok.Pair == 0)
- continue;
- const char *NewPos = Tok.text().begin();
- assert(NewPos >= Code.begin() && NewPos < Code.end());
- Result.append(Pos, NewPos - Pos);
- Result.push_back('^');
- Pos = NewPos;
- }
- Result.append(Pos, Code.end() - Pos);
- return Result;
-}
-
-// Checks that the brackets matched in Stream are those annotated in MarkedCode.
-void verifyMatchedSet(llvm::StringRef Code, llvm::StringRef MarkedCode,
- const TokenStream &Stream) {
- EXPECT_EQ(MarkedCode, decorate(Code, Stream));
-}
-
-// Checks that paired brackets within the stream nest properly.
-void verifyNesting(const TokenStream &Stream) {
- std::vector<const Token *> Stack;
- for (const auto &Tok : Stream.tokens()) {
- if (Tok.Pair > 0)
- Stack.push_back(&Tok);
- else if (Tok.Pair < 0) {
- ASSERT_FALSE(Stack.empty()) << Tok;
- ASSERT_EQ(Stack.back(), Tok.pair())
- << *Stack.back() << " != " << *Tok.pair() << " = pair of " << Tok;
- Stack.pop_back();
- }
- }
- ASSERT_THAT(Stack, testing::IsEmpty());
-}
-
-// Checks that ( pairs with a ) on its right, etc.
-void verifyMatchKind(const TokenStream &Stream) {
- for (const auto &Tok : Stream.tokens()) {
- if (Tok.Pair == 0)
- continue;
- auto Want = [&]() -> std::pair<bool, tok::TokenKind> {
- switch (Tok.Kind) {
- case tok::l_paren:
- return {true, tok::r_paren};
- case tok::r_paren:
- return {false, tok::l_paren};
- case tok::l_brace:
- return {true, tok::r_brace};
- case tok::r_brace:
- return {false, tok::l_brace};
- case tok::l_square:
- return {true, tok::r_square};
- case tok::r_square:
- return {false, tok::l_square};
- default:
- ADD_FAILURE() << "Paired non-bracket " << Tok;
- return {false, tok::eof};
- }
- }();
- EXPECT_EQ(Tok.Pair > 0, Want.first) << Tok;
- EXPECT_EQ(Tok.pair()->Kind, Want.second) << Tok;
- }
-}
-
-// Verifies an expected bracket pairing like:
-// ^( [ ^)
-// The input is annotated code, with the brackets expected to be matched marked.
-//
-// The input doesn't specify which bracket matches with which, but we verify:
-// - exactly the marked subset are paired
-// - ( is paired to a later ), etc
-// - brackets properly nest
-// This uniquely determines the bracket structure, so we indirectly verify it.
-// If particular tests should emphasize which brackets are paired, use comments.
-void verifyBrackets(llvm::StringRef MarkedCode) {
- SCOPED_TRACE(MarkedCode);
- llvm::Annotations A(MarkedCode);
- std::string Code = A.code().str();
- LangOptions LangOpts;
- auto Stream = lex(Code, LangOpts);
- pairBrackets(Stream);
-
- verifyMatchedSet(Code, MarkedCode, Stream);
- verifyNesting(Stream);
- verifyMatchKind(Stream);
-}
-
-TEST(Bracket, SimplePair) {
- verifyBrackets("^{ ^[ ^( ^) ^( ^) ^] ^}");
- verifyBrackets(") ^{ ^[ ^] ^} (");
- verifyBrackets("{ [ ( ] }"); // FIXME
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
deleted file mode 100644
index 53583ceb618640..00000000000000
--- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- Support
- )
-
-add_custom_target(ClangPseudoUnitTests)
-set_target_properties(ClangPseudoUnitTests PROPERTIES FOLDER "Clang Tools Extra/Tests")
-add_unittest(ClangPseudoUnitTests ClangPseudoTests
- BracketTest.cpp
- CXXTest.cpp
- DirectiveTreeTest.cpp
- DisambiguateTest.cpp
- ForestTest.cpp
- GLRTest.cpp
- GrammarTest.cpp
- LRTableTest.cpp
- TokenTest.cpp
-)
-
-clang_target_link_libraries(ClangPseudoTests
- PRIVATE
- clangBasic
- clangLex
- )
-
-target_link_libraries(ClangPseudoTests
- PRIVATE
- clangPseudo
- clangPseudoCXX
- clangPseudoGrammar
- LLVMTestingAnnotations
- LLVMTestingSupport
- )
diff --git a/clang-tools-extra/pseudo/unittests/CXXTest.cpp b/clang-tools-extra/pseudo/unittests/CXXTest.cpp
deleted file mode 100644
index 505f958ae7556c..00000000000000
--- a/clang-tools-extra/pseudo/unittests/CXXTest.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===--- CXXTest.cpp ------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/cxx/CXX.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace pseudo {
-namespace cxx {
-namespace {
-
-TEST(CXX, GeneratedEnums) {
- const auto &Lang = clang::pseudo::cxx::getLanguage();
- EXPECT_EQ("iteration-statement",
- Lang.G.symbolName(Symbol::iteration_statement));
- EXPECT_EQ("iteration-statement := DO statement WHILE ( expression ) ;",
- Lang.G.dumpRule(
- rule::iteration_statement::
- DO__statement__WHILE__L_PAREN__expression__R_PAREN__SEMI));
-}
-
-} // namespace
-} // namespace cxx
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/DirectiveTreeTest.cpp b/clang-tools-extra/pseudo/unittests/DirectiveTreeTest.cpp
deleted file mode 100644
index 19e5e0526142a0..00000000000000
--- a/clang-tools-extra/pseudo/unittests/DirectiveTreeTest.cpp
+++ /dev/null
@@ -1,357 +0,0 @@
-//===--- DirectiveTreeTest.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/DirectiveTree.h"
-
-#include "clang-pseudo/Token.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-using testing::_;
-using testing::ElementsAre;
-using testing::Matcher;
-using testing::Pair;
-using testing::StrEq;
-using Chunk = DirectiveTree::Chunk;
-
-// Matches text of a list of tokens against a string (joined with spaces).
-// e.g. EXPECT_THAT(Stream.tokens(), tokens("int main ( ) { }"));
-MATCHER_P(tokens, Tokens, "") {
- std::vector<llvm::StringRef> Texts;
- for (const Token &Tok : arg)
- Texts.push_back(Tok.text());
- return Matcher<std::string>(StrEq(Tokens))
- .MatchAndExplain(llvm::join(Texts, " "), result_listener);
-}
-
-// Matches tokens covered a directive chunk (with a Tokens property) against a
-// string, similar to tokens() above.
-// e.g. EXPECT_THAT(SomeDirective, tokensAre(Stream, "# include < vector >"));
-MATCHER_P2(tokensAre, TS, Tokens, "tokens are " + std::string(Tokens)) {
- return testing::Matches(tokens(Tokens))(TS.tokens(arg.Tokens));
-}
-
-MATCHER(directiveChunk, "") {
- return std::holds_alternative<DirectiveTree::Directive>(arg);
-}
-MATCHER(codeChunk, "") {
- return std::holds_alternative<DirectiveTree::Code>(arg);
-}
-MATCHER(conditionalChunk, "") {
- return std::holds_alternative<DirectiveTree::Conditional>(arg);
-}
-
-TEST(DirectiveTree, Parse) {
- LangOptions Opts;
- std::string Code = R"cpp(
- #include <foo.h>
-
- int main() {
- #ifdef HAS_FOO
- #if HAS_BAR
- foo(bar);
- #else
- foo(0)
- #endif
- #elif NEEDS_FOO
- #error missing_foo
- #endif
- }
- )cpp";
-
- TokenStream S = cook(lex(Code, Opts), Opts);
- DirectiveTree PP = DirectiveTree::parse(S);
- ASSERT_THAT(PP.Chunks, ElementsAre(directiveChunk(), codeChunk(),
- conditionalChunk(), codeChunk()));
-
- EXPECT_THAT(std::get<DirectiveTree::Directive>(PP.Chunks[0]),
- tokensAre(S, "# include < foo . h >"));
- EXPECT_THAT(std::get<DirectiveTree::Code>(PP.Chunks[1]),
- tokensAre(S, "int main ( ) {"));
- EXPECT_THAT(std::get<DirectiveTree::Code>(PP.Chunks[3]), tokensAre(S, "}"));
-
- const auto &Ifdef = std::get<DirectiveTree::Conditional>(PP.Chunks[2]);
- EXPECT_THAT(Ifdef.Branches,
- ElementsAre(Pair(tokensAre(S, "# ifdef HAS_FOO"), _),
- Pair(tokensAre(S, "# elif NEEDS_FOO"), _)));
- EXPECT_THAT(Ifdef.End, tokensAre(S, "# endif"));
-
- const DirectiveTree &HasFoo(Ifdef.Branches[0].second);
- const DirectiveTree &NeedsFoo(Ifdef.Branches[1].second);
-
- EXPECT_THAT(HasFoo.Chunks, ElementsAre(conditionalChunk()));
- const auto &If = std::get<DirectiveTree::Conditional>(HasFoo.Chunks[0]);
- EXPECT_THAT(If.Branches, ElementsAre(Pair(tokensAre(S, "# if HAS_BAR"), _),
- Pair(tokensAre(S, "# else"), _)));
- EXPECT_THAT(If.Branches[0].second.Chunks, ElementsAre(codeChunk()));
- EXPECT_THAT(If.Branches[1].second.Chunks, ElementsAre(codeChunk()));
-
- EXPECT_THAT(NeedsFoo.Chunks, ElementsAre(directiveChunk()));
- const auto &Error = std::get<DirectiveTree::Directive>(NeedsFoo.Chunks[0]);
- EXPECT_THAT(Error, tokensAre(S, "# error missing_foo"));
- EXPECT_EQ(Error.Kind, tok::pp_error);
-}
-
-TEST(DirectiveTree, ParseUgly) {
- LangOptions Opts;
- std::string Code = R"cpp(
- /*A*/ # /*B*/ \
- /*C*/ \
-define \
-BAR /*D*/
-/*E*/
-)cpp";
- TokenStream S = cook(lex(Code, Opts), Opts);
- DirectiveTree PP = DirectiveTree::parse(S);
-
- ASSERT_THAT(PP.Chunks,
- ElementsAre(codeChunk(), directiveChunk(), codeChunk()));
- EXPECT_THAT(std::get<DirectiveTree::Code>(PP.Chunks[0]),
- tokensAre(S, "/*A*/"));
- const auto &Define = std::get<DirectiveTree::Directive>(PP.Chunks[1]);
- EXPECT_EQ(Define.Kind, tok::pp_define);
- EXPECT_THAT(Define, tokensAre(S, "# /*B*/ /*C*/ define BAR /*D*/"));
- EXPECT_THAT(std::get<DirectiveTree::Code>(PP.Chunks[2]),
- tokensAre(S, "/*E*/"));
-}
-
-TEST(DirectiveTree, ParseBroken) {
- LangOptions Opts;
- std::string Code = R"cpp(
- a
- #endif // mismatched
- #if X
- b
-)cpp";
- TokenStream S = cook(lex(Code, Opts), Opts);
- DirectiveTree PP = DirectiveTree::parse(S);
-
- ASSERT_THAT(PP.Chunks,
- ElementsAre(codeChunk(), directiveChunk(), conditionalChunk()));
- EXPECT_THAT(std::get<DirectiveTree::Code>(PP.Chunks[0]), tokensAre(S, "a"));
- const auto &Endif = std::get<DirectiveTree::Directive>(PP.Chunks[1]);
- EXPECT_EQ(Endif.Kind, tok::pp_endif);
- EXPECT_THAT(Endif, tokensAre(S, "# endif // mismatched"));
-
- const auto &X = std::get<DirectiveTree::Conditional>(PP.Chunks[2]);
- EXPECT_EQ(1u, X.Branches.size());
- // The (only) branch of the broken conditional section runs until eof.
- EXPECT_EQ(tok::pp_if, X.Branches.front().first.Kind);
- EXPECT_THAT(X.Branches.front().second.Chunks, ElementsAre(codeChunk()));
- // The missing terminating directive is marked as pp_not_keyword.
- EXPECT_EQ(tok::pp_not_keyword, X.End.Kind);
- EXPECT_EQ(0u, X.End.Tokens.size());
-}
-
-TEST(DirectiveTree, ChooseBranches) {
- LangOptions Opts;
- const std::string Cases[] = {
- R"cpp(
- // Branches with no alternatives are taken
- #if COND // TAKEN
- int x;
- #endif
- )cpp",
-
- R"cpp(
- // Empty branches are better than nothing
- #if COND // TAKEN
- #endif
- )cpp",
-
- R"cpp(
- // Trivially false branches are not taken, even with no alternatives.
- #if 0
- int x;
- #endif
- )cpp",
-
- R"cpp(
- // Longer branches are preferred over shorter branches
- #if COND // TAKEN
- int x = 1;
- #else
- int x;
- #endif
-
- #if COND
- int x;
- #else // TAKEN
- int x = 1;
- #endif
- )cpp",
-
- R"cpp(
- // Trivially true branches are taken if previous branches are trivial.
- #if 1 // TAKEN
- #else
- int x = 1;
- #endif
-
- #if 0
- int x = 1;
- #elif 0
- int x = 2;
- #elif 1 // TAKEN
- int x;
- #endif
-
- #if 0
- int x = 1;
- #elif FOO // TAKEN
- int x = 2;
- #elif 1
- int x;
- #endif
- )cpp",
-
- R"cpp(
- // #else is a trivially true branch
- #if 0
- int x = 1;
- #elif 0
- int x = 2;
- #else // TAKEN
- int x;
- #endif
- )cpp",
-
- R"cpp(
- // Directives break ties, but nondirective text is more important.
- #if FOO
- #define A 1 2 3
- #else // TAKEN
- #define B 4 5 6
- #define C 7 8 9
- #endif
-
- #if FOO // TAKEN
- ;
- #define A 1 2 3
- #else
- #define B 4 5 6
- #define C 7 8 9
- #endif
- )cpp",
-
- R"cpp(
- // Avoid #error directives.
- #if FOO
- int x = 42;
- #error This branch is no good
- #else // TAKEN
- #endif
-
- #if FOO
- // All paths here lead to errors.
- int x = 42;
- #if 1 // TAKEN
- #if COND // TAKEN
- #error This branch is no good
- #else
- #error This one is no good either
- #endif
- #endif
- #else // TAKEN
- #endif
- )cpp",
-
- R"cpp(
- // Populate taken branches recursively.
- #if FOO // TAKEN
- int x = 42;
- #if BAR
- ;
- #else // TAKEN
- int y = 43;
- #endif
- #else
- int x;
- #if BAR // TAKEN
- int y;
- #else
- ;
- #endif
- #endif
- )cpp",
- };
- for (const auto &Code : Cases) {
- TokenStream S = cook(lex(Code, Opts), Opts);
-
- std::function<void(const DirectiveTree &)> Verify =
- [&](const DirectiveTree &M) {
- for (const auto &C : M.Chunks) {
- if (!std::holds_alternative<DirectiveTree::Conditional>(C))
- continue;
- const DirectiveTree::Conditional &Cond =
- std::get<DirectiveTree::Conditional>(C);
- for (unsigned I = 0; I < Cond.Branches.size(); ++I) {
- auto Directive = S.tokens(Cond.Branches[I].first.Tokens);
- EXPECT_EQ(I == Cond.Taken, Directive.back().text() == "// TAKEN")
- << "At line " << Directive.front().Line << " of: " << Code;
- Verify(Cond.Branches[I].second);
- }
- }
- };
-
- DirectiveTree Tree = DirectiveTree::parse(S);
- chooseConditionalBranches(Tree, S);
- Verify(Tree);
- }
-}
-
-TEST(DirectiveTree, StripDirectives) {
- LangOptions Opts;
- std::string Code = R"cpp(
- #include <stddef.h>
- a a a
- #warning AAA
- b b b
- #if 1
- c c c
- #warning BBB
- #if 0
- d d d
- #warning CC
- #else
- e e e
- #endif
- f f f
- #if 0
- g g g
- #endif
- h h h
- #else
- i i i
- #endif
- j j j
- )cpp";
- TokenStream S = lex(Code, Opts);
-
- DirectiveTree Tree = DirectiveTree::parse(S);
- chooseConditionalBranches(Tree, S);
- EXPECT_THAT(Tree.stripDirectives(S).tokens(),
- tokens("a a a b b b c c c e e e f f f h h h j j j"));
-
- const DirectiveTree &Part =
- std::get<DirectiveTree::Conditional>(Tree.Chunks[4]).Branches[0].second;
- EXPECT_THAT(Part.stripDirectives(S).tokens(),
- tokens("c c c e e e f f f h h h"));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp b/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
deleted file mode 100644
index 2f483bb0906607..00000000000000
--- a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-//===--- DisambiguateTest.cpp ---------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Disambiguate.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Token.h"
-#include "clang/Basic/TokenKinds.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-namespace {
-using testing::ElementsAre;
-using testing::Pair;
-using testing::UnorderedElementsAre;
-
-// Common disambiguation test fixture.
-// This is the ambiguous forest representing parses of 'a * b;'.
-class DisambiguateTest : public ::testing::Test {
-protected:
- // Greatly simplified C++ grammar.
- enum Symbol : SymbolID {
- Statement,
- Declarator,
- Expression,
- DeclSpecifier,
- Type,
- Template,
- };
- enum Rule : RuleID {
- /* LHS__RHS1_RHS2 means LHS := RHS1 RHS2 */
- Statement__DeclSpecifier_Declarator_Semi,
- Declarator__Star_Declarator,
- Declarator__Identifier,
- Statement__Expression_Semi,
- Expression__Expression_Star_Expression,
- Expression__Identifier,
- DeclSpecifier__Type,
- DeclSpecifier__Template,
- Type__Identifier,
- Template__Identifier,
- };
-
- ForestArena Arena;
- ForestNode &A = Arena.createTerminal(tok::identifier, 0);
- ForestNode &Star = Arena.createTerminal(tok::star, 1);
- ForestNode &B = Arena.createTerminal(tok::identifier, 2);
- ForestNode &Semi = Arena.createTerminal(tok::semi, 3);
-
- // Parse as multiplication expression.
- ForestNode &AExpr =
- Arena.createSequence(Expression, Expression__Identifier, &A);
- ForestNode &BExpr =
- Arena.createSequence(Expression, Expression__Identifier, &B);
- ForestNode &Expr =
- Arena.createSequence(Expression, Expression__Expression_Star_Expression,
- {&AExpr, &Star, &BExpr});
- ForestNode &ExprStmt = Arena.createSequence(
- Statement, Statement__Expression_Semi, {&Expr, &Semi});
- // Parse as declaration (`a` may be CTAD or not).
- ForestNode &AType =
- Arena.createSequence(DeclSpecifier, DeclSpecifier__Type,
- &Arena.createSequence(Type, Type__Identifier, &A));
- ForestNode &ATemplate = Arena.createSequence(
- DeclSpecifier, DeclSpecifier__Template,
- &Arena.createSequence(Template, Template__Identifier, &A));
- ForestNode &DeclSpec =
- Arena.createAmbiguous(DeclSpecifier, {&AType, &ATemplate});
- ForestNode &BDeclarator =
- Arena.createSequence(Declarator, Declarator__Identifier, &B);
- ForestNode &BPtr = Arena.createSequence(
- Declarator, Declarator__Star_Declarator, {&Star, &BDeclarator});
- ForestNode &DeclStmt =
- Arena.createSequence(Statement, Statement__DeclSpecifier_Declarator_Semi,
- {&DeclSpec, &Star, &BDeclarator});
- // Top-level ambiguity
- ForestNode &Stmt = Arena.createAmbiguous(Statement, {&ExprStmt, &DeclStmt});
-};
-
-TEST_F(DisambiguateTest, Remove) {
- Disambiguation D;
- D.try_emplace(&Stmt, 1); // statement is a declaration, not an expression
- D.try_emplace(&DeclSpec, 0); // a is a type, not a (CTAD) template
- ForestNode *Root = &Stmt;
- removeAmbiguities(Root, D);
-
- EXPECT_EQ(Root, &DeclStmt);
- EXPECT_THAT(DeclStmt.elements(), ElementsAre(&AType, &Star, &BDeclarator));
-}
-
-TEST_F(DisambiguateTest, DummyStrategy) {
- Disambiguation D = disambiguate(&Stmt, {});
- EXPECT_THAT(D, UnorderedElementsAre(Pair(&Stmt, 1), Pair(&DeclSpec, 1)));
-
- ForestNode *Root = &Stmt;
- removeAmbiguities(Root, D);
- EXPECT_EQ(Root, &DeclStmt);
- EXPECT_THAT(DeclStmt.elements(),
- ElementsAre(&ATemplate, &Star, &BDeclarator));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
deleted file mode 100644
index 36af896148209d..00000000000000
--- a/clang-tools-extra/pseudo/unittests/ForestTest.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-//===--- ForestTest.cpp - Test Forest dump ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Token.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/StringRef.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-// FIXME: extract to a TestGrammar class to allow code sharing among tests.
-class ForestTest : public ::testing::Test {
-public:
- void build(llvm::StringRef BNF) {
- Diags.clear();
- G = Grammar::parseBNF(BNF, Diags);
- }
-
- SymbolID symbol(llvm::StringRef Name) const {
- for (unsigned I = 0; I < NumTerminals; ++I)
- if (G.table().Terminals[I] == Name)
- return tokenSymbol(static_cast<tok::TokenKind>(I));
- for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID)
- if (G.table().Nonterminals[ID].Name == Name)
- return ID;
- ADD_FAILURE() << "No such symbol found: " << Name;
- return 0;
- }
-
- RuleID ruleFor(llvm::StringRef NonterminalName) const {
- auto RuleRange = G.table().Nonterminals[symbol(NonterminalName)].RuleRange;
- if (RuleRange.End - RuleRange.Start == 1)
- return G.table().Nonterminals[symbol(NonterminalName)].RuleRange.Start;
- ADD_FAILURE() << "Expected a single rule for " << NonterminalName
- << ", but it has " << RuleRange.End - RuleRange.Start
- << " rule!\n";
- return 0;
- }
-
-protected:
- Grammar G;
- std::vector<std::string> Diags;
-};
-
-TEST_F(ForestTest, DumpBasic) {
- build(R"cpp(
- _ := add-expression EOF
- add-expression := id-expression + id-expression
- id-expression := IDENTIFIER
- )cpp");
- ASSERT_TRUE(Diags.empty());
- ForestArena Arena;
- const auto &TS =
- cook(lex("a + b", clang::LangOptions()), clang::LangOptions());
-
- auto T = Arena.createTerminals(TS);
- ASSERT_EQ(T.size(), 4u);
- const auto *Left = &Arena.createSequence(
- symbol("id-expression"), ruleFor("id-expression"), {&T.front()});
- const auto *Right = &Arena.createSequence(symbol("id-expression"),
- ruleFor("id-expression"), {&T[2]});
-
- const auto *Add =
- &Arena.createSequence(symbol("add-expression"), ruleFor("add-expression"),
- {Left, &T[1], Right});
- EXPECT_EQ(Add->dumpRecursive(G, true),
- "[ 0, end) add-expression := id-expression + id-expression\n"
- "[ 0, 1) ├─id-expression~IDENTIFIER := tok[0]\n"
- "[ 1, 2) ├─+ := tok[1]\n"
- "[ 2, end) └─id-expression~IDENTIFIER := tok[2]\n");
- EXPECT_EQ(Add->dumpRecursive(G, false),
- "[ 0, end) add-expression := id-expression + id-expression\n"
- "[ 0, 1) ├─id-expression := IDENTIFIER\n"
- "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
- "[ 1, 2) ├─+ := tok[1]\n"
- "[ 2, end) └─id-expression := IDENTIFIER\n"
- "[ 2, end) └─IDENTIFIER := tok[2]\n");
-}
-
-TEST_F(ForestTest, DumpAmbiguousAndRefs) {
- build(R"cpp(
- _ := type EOF
- type := class-type # rule 4
- type := enum-type # rule 5
- class-type := shared-type
- enum-type := shared-type
- shared-type := IDENTIFIER)cpp");
- ASSERT_TRUE(Diags.empty());
- ForestArena Arena;
- const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions());
-
- auto Terminals = Arena.createTerminals(TS);
- ASSERT_EQ(Terminals.size(), 2u);
-
- const auto *SharedType = &Arena.createSequence(
- symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()});
- const auto *ClassType = &Arena.createSequence(
- symbol("class-type"), ruleFor("class-type"), {SharedType});
- const auto *EnumType = &Arena.createSequence(
- symbol("enum-type"), ruleFor("enum-type"), {SharedType});
- const auto *Alternative1 =
- &Arena.createSequence(symbol("type"), /*RuleID=*/4, {ClassType});
- const auto *Alternative2 =
- &Arena.createSequence(symbol("type"), /*RuleID=*/5, {EnumType});
- const auto *Type =
- &Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2});
- EXPECT_EQ(Type->dumpRecursive(G),
- "[ 0, end) type := <ambiguous>\n"
- "[ 0, end) ├─type := class-type\n"
- "[ 0, end) │ └─class-type := shared-type\n"
- "[ 0, end) │ └─shared-type := IDENTIFIER #1\n"
- "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
- "[ 0, end) └─type := enum-type\n"
- "[ 0, end) └─enum-type := shared-type\n"
- "[ 0, end) └─shared-type =#1\n");
-}
-
-TEST_F(ForestTest, DumpAbbreviatedShared) {
- build(R"cpp(
- _ := A
- A := B
- B := *
- )cpp");
-
- ForestArena Arena;
- const auto *Star = &Arena.createTerminal(tok::star, 0);
-
- const auto *B = &Arena.createSequence(symbol("B"), ruleFor("B"), {Star});
- // We have two identical (but distinct) A nodes.
- // The GLR parser would never produce this, but it makes the example simpler.
- const auto *A1 = &Arena.createSequence(symbol("A"), ruleFor("A"), {B});
- const auto *A2 = &Arena.createSequence(symbol("A"), ruleFor("A"), {B});
- const auto *A = &Arena.createAmbiguous(symbol("A"), {A1, A2});
-
- // We must not abbreviate away shared nodes: if we show A~* there's no way to
- // show that the intermediate B node is shared between A1 and A2.
- EXPECT_EQ(A->dumpRecursive(G, /*Abbreviate=*/true),
- "[ 0, end) A := <ambiguous>\n"
- "[ 0, end) ├─A~B := * #1\n"
- "[ 0, end) │ └─* := tok[0]\n"
- "[ 0, end) └─A~B =#1\n");
-}
-
-TEST_F(ForestTest, Iteration) {
- // Z
- // / \
- // X Y
- // |\|
- // A B
- ForestArena Arena;
- const auto *A = &Arena.createTerminal(tok::identifier, 0);
- const auto *B = &Arena.createOpaque(1, 0);
- const auto *X = &Arena.createSequence(2, 1, {A, B});
- const auto *Y = &Arena.createSequence(2, 2, {B});
- const auto *Z = &Arena.createAmbiguous(2, {X, Y});
-
- std::vector<const ForestNode *> Nodes;
- for (const ForestNode &N : Z->descendants())
- Nodes.push_back(&N);
- EXPECT_THAT(Nodes, testing::UnorderedElementsAre(A, B, X, Y, Z));
-
- Nodes.clear();
- for (const ForestNode &N : X->descendants())
- Nodes.push_back(&N);
- EXPECT_THAT(Nodes, testing::UnorderedElementsAre(X, A, B));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
deleted file mode 100644
index f361fb78247acd..00000000000000
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ /dev/null
@@ -1,789 +0,0 @@
-//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <memory>
-
-namespace clang {
-namespace pseudo {
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const std::vector<const GSS::Node *> &Heads) {
- for (const auto *Head : Heads)
- OS << *Head << "\n";
- return OS;
-}
-
-namespace {
-
-using StateID = LRTable::StateID;
-using testing::AllOf;
-using testing::ElementsAre;
-using testing::IsEmpty;
-using testing::UnorderedElementsAre;
-
-MATCHER_P(state, StateID, "") { return arg->State == StateID; }
-MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
-MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
-MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
-
-testing::Matcher<const GSS::Node *>
-parents(llvm::ArrayRef<const GSS::Node *> Parents) {
- return testing::Property(&GSS::Node::parents,
- testing::UnorderedElementsAreArray(Parents));
-}
-
-Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
- EXPECT_GT(Begin, 0u);
- const Token &Left = Code.tokens()[Begin - 1];
- EXPECT_EQ(Left.Kind, tok::l_brace);
- if (const auto* Right = Left.pair()) {
- EXPECT_EQ(Right->Kind, tok::r_brace);
- return Code.index(*Right);
- }
- return Token::Invalid;
-}
-
-class GLRTest : public ::testing::Test {
-public:
- void build(llvm::StringRef GrammarBNF) {
- std::vector<std::string> Diags;
- TestLang.G = Grammar::parseBNF(GrammarBNF, Diags);
- }
-
- TokenStream emptyTokenStream() {
- TokenStream Empty;
- Empty.finalize();
- return Empty;
- }
-
- void buildGrammar(std::vector<std::string> Nonterminals,
- std::vector<std::string> Rules) {
- Nonterminals.push_back("_");
- llvm::sort(Nonterminals);
- Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()),
- Nonterminals.end());
- std::string FakeTestBNF;
- for (const auto &NT : Nonterminals)
- FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT);
- FakeTestBNF += llvm::join(Rules, "\n");
- build(FakeTestBNF);
- }
-
- SymbolID id(llvm::StringRef Name) const {
- for (unsigned I = 0; I < NumTerminals; ++I)
- if (TestLang.G.table().Terminals[I] == Name)
- return tokenSymbol(static_cast<tok::TokenKind>(I));
- for (SymbolID ID = 0; ID < TestLang.G.table().Nonterminals.size(); ++ID)
- if (TestLang.G.table().Nonterminals[ID].Name == Name)
- return ID;
- ADD_FAILURE() << "No such symbol found: " << Name;
- return 0;
- }
- ExtensionID extensionID(llvm::StringRef AttrValueName) const {
- for (ExtensionID EID = 0; EID < TestLang.G.table().AttributeValues.size();
- ++EID)
- if (TestLang.G.table().AttributeValues[EID] == AttrValueName)
- return EID;
- ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
- return 0;
- }
-
- RuleID ruleFor(llvm::StringRef NonterminalName) const {
- auto RuleRange =
- TestLang.G.table().Nonterminals[id(NonterminalName)].RuleRange;
- if (RuleRange.End - RuleRange.Start == 1)
- return TestLang.G.table()
- .Nonterminals[id(NonterminalName)]
- .RuleRange.Start;
- ADD_FAILURE() << "Expected a single rule for " << NonterminalName
- << ", but it has " << RuleRange.End - RuleRange.Start
- << " rule!\n";
- return 0;
- }
-
-protected:
- Language TestLang;
- ForestArena Arena;
- GSS GSStack;
-};
-
-TEST_F(GLRTest, ShiftMergingHeads) {
- // Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1,
- // 2 have shift actions to reach state 4, and the head 3 has a shift action to
- // reach state 5:
- // 0--1
- // └--2
- // └--3
- // After the shift action, the GSS (with new heads 4, 5) is:
- // 0---1---4
- // └---2---┘
- // └---3---5
- auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
- /*Parents=*/{GSSNode0});
- auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
- /*Parents=*/{GSSNode0});
- auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr,
- /*Parents=*/{GSSNode0});
-
- buildGrammar({}, {}); // Create a fake empty grammar.
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, tokenSymbol(tok::semi)}] = StateID{4};
- B.Transition[{StateID{2}, tokenSymbol(tok::semi)}] = StateID{4};
- B.Transition[{StateID{3}, tokenSymbol(tok::semi)}] = StateID{5};
- TestLang.Table = std::move(B).build();
-
- ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
- std::vector<const GSS::Node *> NewHeads;
- glrShift({GSSNode1, GSSNode2, GSSNode3}, SemiTerminal,
- {emptyTokenStream(), Arena, GSStack}, TestLang, NewHeads);
-
- EXPECT_THAT(NewHeads,
- UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal),
- parents({GSSNode1, GSSNode2})),
- AllOf(state(5), parsedSymbol(&SemiTerminal),
- parents({GSSNode3}))))
- << NewHeads;
-}
-
-TEST_F(GLRTest, ReduceConflictsSplitting) {
- // Before (splitting due to R/R conflict):
- // 0--1(IDENTIFIER)
- // After reducing 1 by `class-name := IDENTIFIER` and
- // `enum-name := IDENTIFIER`:
- // 0--2(class-name) // 2 is goto(0, class-name)
- // └--3(enum-name) // 3 is goto(0, enum-name)
- buildGrammar({"class-name", "enum-name"},
- {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{0}, id("class-name")}] = StateID{2};
- B.Transition[{StateID{0}, id("enum-name")}] = StateID{3};
- B.Reduce[StateID{1}].insert(ruleFor("class-name"));
- B.Reduce[StateID{1}].insert(ruleFor("enum-name"));
- TestLang.Table = std::move(B).build();
-
- const auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 =
- GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
-
- std::vector<const GSS::Node *> Heads = {GSSNode1};
- glrReduce(Heads, tokenSymbol(tok::eof),
- {emptyTokenStream(), Arena, GSStack}, TestLang);
- EXPECT_THAT(Heads, UnorderedElementsAre(
- GSSNode1,
- AllOf(state(2), parsedSymbolID(id("class-name")),
- parents({GSSNode0})),
- AllOf(state(3), parsedSymbolID(id("enum-name")),
- parents({GSSNode0}))))
- << Heads;
-}
-
-TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
- // Before (splitting due to multiple bases):
- // 2(class-name)--4(*)
- // 3(enum-name)---┘
- // After reducing 4 by `ptr-operator := *`:
- // 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
- // 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
- buildGrammar({"ptr-operator", "class-name", "enum-name"},
- {"ptr-operator := *"});
-
- auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
- auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
-
- const auto *GSSNode2 =
- GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
- const auto *GSSNode3 =
- GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
- const auto *GSSNode4 = GSStack.addNode(
- /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
- /*Parents=*/{GSSNode2, GSSNode3});
-
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{2}, id("ptr-operator")}] = StateID{5};
- B.Transition[{StateID{3}, id("ptr-operator")}] = StateID{6};
- B.Reduce[StateID{4}].insert(ruleFor("ptr-operator"));
- TestLang.Table = std::move(B).build();
-
- std::vector<const GSS::Node *> Heads = {GSSNode4};
- glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
- TestLang);
-
- EXPECT_THAT(Heads, UnorderedElementsAre(
- GSSNode4,
- AllOf(state(5), parsedSymbolID(id("ptr-operator")),
- parents({GSSNode2})),
- AllOf(state(6), parsedSymbolID(id("ptr-operator")),
- parents({GSSNode3}))))
- << Heads;
- // Verify that the payload of the two new heads is shared, only a single
- // ptr-operator node is created in the forest.
- EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload);
-}
-
-TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
- // Before (joining due to same goto state, multiple bases):
- // 0--1(cv-qualifier)--3(class-name)
- // └--2(cv-qualifier)--4(enum-name)
- // After reducing 3 by `type-name := class-name` and
- // 4 by `type-name := enum-name`:
- // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
- // └--2(cv-qualifier)--┘ // goto(2, type-name)
- buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"},
- {"type-name := class-name", "type-name := enum-name"});
-
- auto *CVQualifierNode =
- &Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0);
- auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1);
- auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1);
-
- const auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 = GSStack.addNode(
- /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
- const auto *GSSNode2 = GSStack.addNode(
- /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
- const auto *GSSNode3 = GSStack.addNode(
- /*State=*/3, /*ForestNode=*/ClassNameNode,
- /*Parents=*/{GSSNode1});
- const auto *GSSNode4 =
- GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
- /*Parents=*/{GSSNode2});
-
- // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("type-name")}] = StateID{5};
- B.Transition[{StateID{2}, id("type-name")}] = StateID{5};
- B.Reduce[StateID{3}].insert(/* type-name := class-name */ RuleID{0});
- B.Reduce[StateID{4}].insert(/* type-name := enum-name */ RuleID{1});
- TestLang.Table = std::move(B).build();
-
- std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
- glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
- TestLang);
-
- // Verify that the stack heads are joint at state 5 after reduces.
- EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
- AllOf(state(5),
- parsedSymbolID(id("type-name")),
- parents({GSSNode1, GSSNode2}))))
- << Heads;
- // Verify that we create an ambiguous ForestNode of two parses of `type-name`.
- EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
- "[ 1, end) type-name := <ambiguous>\n"
- "[ 1, end) ├─type-name := class-name\n"
- "[ 1, end) │ └─class-name := <opaque>\n"
- "[ 1, end) └─type-name := enum-name\n"
- "[ 1, end) └─enum-name := <opaque>\n");
-}
-
-TEST_F(GLRTest, ReduceJoiningWithSameBase) {
- // Before (joining due to same goto state, the same base):
- // 0--1(class-name)--3(*)
- // └--2(enum-name)--4(*)
- // After reducing 3 by `pointer := class-name *` and
- // 2 by `pointer := enum-name *`:
- // 0--5(pointer) // 5 is goto(0, pointer)
- buildGrammar({"pointer", "class-name", "enum-name"},
- {"pointer := class-name *", "pointer := enum-name *"});
-
- auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
- auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
- auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1);
-
- const auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 =
- GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode,
- /*Parents=*/{GSSNode0});
- const auto *GSSNode2 =
- GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode,
- /*Parents=*/{GSSNode0});
- const auto *GSSNode3 =
- GSStack.addNode(/*State=*/3, /*ForestNode=*/StartTerminal,
- /*Parents=*/{GSSNode1});
- const auto *GSSNode4 =
- GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
- /*Parents=*/{GSSNode2});
-
- // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{0}, id("pointer")}] = StateID{5};
- B.Reduce[StateID{3}].insert(/* pointer := class-name */ RuleID{0});
- B.Reduce[StateID{4}].insert(/* pointer := enum-name */ RuleID{1});
- TestLang.Table = std::move(B).build();
-
- std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
- glrReduce(Heads, tokenSymbol(tok::eof),
- {emptyTokenStream(), Arena, GSStack}, TestLang);
-
- EXPECT_THAT(
- Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
- AllOf(state(5), parsedSymbolID(id("pointer")),
- parents({GSSNode0}))))
- << Heads;
- EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
- "[ 0, end) pointer := <ambiguous>\n"
- "[ 0, end) ├─pointer := class-name *\n"
- "[ 0, 1) │ ├─class-name := <opaque>\n"
- "[ 1, end) │ └─* := tok[1]\n"
- "[ 0, end) └─pointer := enum-name *\n"
- "[ 0, 1) ├─enum-name := <opaque>\n"
- "[ 1, end) └─* := tok[1]\n");
-}
-
-TEST_F(GLRTest, ReduceLookahead) {
- // A term can be followed by +, but not by -.
- buildGrammar({"sum", "term"}, {"expr := term + term", "term := IDENTIFIER"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{0}, id("term")}] = StateID{2};
- B.Reduce[StateID{1}].insert(RuleID{0});
- TestLang.Table = std::move(B).build();
-
- auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0);
-
- const auto *Root =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 =
- GSStack.addNode(/*State=*/1, /*ForestNode=*/Identifier, {Root});
-
- // When the lookahead is +, reduce is performed.
- std::vector<const GSS::Node *> Heads = {GSSNode1};
- glrReduce(Heads, tokenSymbol(tok::plus), {emptyTokenStream(), Arena, GSStack},
- TestLang);
- EXPECT_THAT(Heads,
- ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")),
- parents(Root))));
-
- // When the lookahead is -, reduce is not performed.
- Heads = {GSSNode1};
- glrReduce(Heads, tokenSymbol(tok::minus),
- {emptyTokenStream(), Arena, GSStack}, TestLang);
- EXPECT_THAT(Heads, ElementsAre(GSSNode1));
-}
-
-TEST_F(GLRTest, Recover) {
- // Recovery while parsing "word" inside braces.
- // Before:
- // 0--1({)--2(?)
- // After recovering a `word` at state 1:
- // 0--3(word) // 3 is goto(1, word)
- buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("word")}] = StateID{3};
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
- TestLang.Table = std::move(B).build();
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
-
- auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
- auto *Question1 = &Arena.createTerminal(tok::question, 1);
- const auto *Root = GSStack.addNode(0, nullptr, {});
- const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
- const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
-
- // Need a token stream with paired braces so the strategy works.
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
- pairBrackets(Tokens);
- std::vector<const GSS::Node *> NewHeads;
-
- unsigned TokenIndex = 2;
- glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
- NewHeads);
- EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
- EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
- parents({OpenedBraces}), start(1u))));
- EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
-
- // Test recovery failure: omit closing brace so strategy fails
- TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
- pairBrackets(NoRBrace);
- NewHeads.clear();
- TokenIndex = 2;
- glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang,
- NewHeads);
- EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
- EXPECT_THAT(NewHeads, IsEmpty());
-}
-
-TEST_F(GLRTest, RecoverRightmost) {
- // In a nested block structure, we recover at the innermost possible block.
- // Before:
- // 0--1({)--1({)--1({)
- // After recovering a `block` at inside the second braces:
- // 0--1({)--2(body) // 2 is goto(1, body)
- buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("body")}] = StateID{2};
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("body")}});
- TestLang.Table = std::move(B).build();
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
-
- clang::LangOptions LOptions;
- // Innermost brace is unmatched, to test fallback to next brace.
- TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
- Tokens.tokens()[0].Pair = 5;
- Tokens.tokens()[1].Pair = 4;
- Tokens.tokens()[4].Pair = 1;
- Tokens.tokens()[5].Pair = 0;
-
- auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
- auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
- auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
- const auto *Root = GSStack.addNode(0, nullptr, {});
- const auto *In1 = GSStack.addNode(1, Brace1, {Root});
- const auto *In2 = GSStack.addNode(1, Brace2, {In1});
- const auto *In3 = GSStack.addNode(1, Brace3, {In2});
-
- unsigned TokenIndex = 3;
- std::vector<const GSS::Node *> NewHeads;
- glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads);
- EXPECT_EQ(TokenIndex, 5u);
- EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
- parents({In2}), start(2u))));
-}
-
-TEST_F(GLRTest, RecoverAlternatives) {
- // Recovery inside braces with multiple equally good options
- // Before:
- // 0--1({)
- // After recovering either `word` or `number` inside the braces:
- // 0--1({)--2(word) // 2 is goto(1, word)
- // └--3(number) // 3 is goto(1, number)
- buildGrammar({"number", "word", "top"},
- {
- "top := { number [recover=Braces] }",
- "top := { word [recover=Braces] }",
- });
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("number")}] = StateID{2};
- B.Transition[{StateID{1}, id("word")}] = StateID{3};
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("number")}});
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
- TestLang.Table = std::move(B).build();
- auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
- const auto *Root = GSStack.addNode(0, nullptr, {});
- const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
-
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
- pairBrackets(Tokens);
- std::vector<const GSS::Node *> NewHeads;
- unsigned TokenIndex = 1;
-
- glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
- NewHeads);
- EXPECT_EQ(TokenIndex, 2u);
- EXPECT_THAT(NewHeads,
- UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
- parents({OpenedBraces}), start(1u)),
- AllOf(state(3), parsedSymbolID(id("word")),
- parents({OpenedBraces}), start(1u))));
-}
-
-TEST_F(GLRTest, PerfectForestNodeSharing) {
- // Run the GLR on a simple grammar and test that we build exactly one forest
- // node per (SymbolID, token range).
-
- // This is a grmammar where the original parsing-stack-based forest node
- // sharing approach will fail. In its LR0 graph, it has two states containing
- // item `expr := • IDENTIFIER`, and both have different goto states on the
- // nonterminal `expr`.
- build(R"bnf(
- _ := test EOF
-
- test := { expr
- test := { IDENTIFIER
- test := left-paren expr
- left-paren := {
- expr := IDENTIFIER
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- clang::LangOptions LOptions;
- const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
- // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
- // in the forest, see the `#1` and `=#1` in the dump string.
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) test := <ambiguous>\n"
- "[ 0, end) ├─test := { expr\n"
- "[ 0, 1) │ ├─{ := tok[0]\n"
- "[ 1, end) │ └─expr := IDENTIFIER #1\n"
- "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
- "[ 0, end) ├─test := { IDENTIFIER\n"
- "[ 0, 1) │ ├─{ := tok[0]\n"
- "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
- "[ 0, end) └─test := left-paren expr\n"
- "[ 0, 1) ├─left-paren := {\n"
- "[ 0, 1) │ └─{ := tok[0]\n"
- "[ 1, end) └─expr =#1\n");
-}
-
-TEST_F(GLRTest, GLRReduceOrder) {
- // Given the following grammar, and the input `IDENTIFIER`, reductions should
- // be performed in the following order:
- // 1. foo := IDENTIFIER
- // 2. { test := IDENTIFIER, test := foo }
- // foo should be reduced first, so that in step 2 we have completed reduces
- // for test, and form an ambiguous forest node.
- build(R"bnf(
- _ := test EOF
-
- test := IDENTIFIER
- test := foo
- foo := IDENTIFIER
- )bnf");
- clang::LangOptions LOptions;
- const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
- TestLang.Table = LRTable::buildSLR(TestLang.G);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) test := <ambiguous>\n"
- "[ 0, end) ├─test := IDENTIFIER\n"
- "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
- "[ 0, end) └─test := foo\n"
- "[ 0, end) └─foo := IDENTIFIER\n"
- "[ 0, end) └─IDENTIFIER := tok[0]\n");
-}
-
-TEST_F(GLRTest, RecoveryEndToEnd) {
- // Simple example of brace-based recovery showing:
- // - recovered region includes tokens both ahead of and behind the cursor
- // - multiple possible recovery rules
- // - recovery from outer scopes is rejected
- build(R"bnf(
- _ := block EOF
-
- block := { block [recover=Braces] }
- block := { numbers [recover=Braces] }
- numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
- pairBrackets(Tokens);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("block"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) block := { block [recover=Braces] }\n"
- "[ 0, 1) ├─{ := tok[0]\n"
- "[ 1, 5) ├─block := <ambiguous>\n"
- "[ 1, 5) │ ├─block := { block [recover=Braces] }\n"
- "[ 1, 2) │ │ ├─{ := tok[1]\n"
- "[ 2, 4) │ │ ├─block := <opaque>\n"
- "[ 4, 5) │ │ └─} := tok[4]\n"
- "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n"
- "[ 1, 2) │ ├─{ := tok[1]\n"
- "[ 2, 4) │ ├─numbers := <opaque>\n"
- "[ 4, 5) │ └─} := tok[4]\n"
- "[ 5, end) └─} := tok[5]\n");
-}
-
-TEST_F(GLRTest, RecoverTerminal) {
- build(R"bnf(
- _ := stmt EOF
-
- stmt := IDENTIFIER ; [recover=Skip]
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("Skip"),
- [](Token::Index Start, const TokenStream &) { return Start; });
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("stmt"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
- "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
- "[ 1, end) └─; := <opaque>\n");
-}
-
-TEST_F(GLRTest, RecoverUnrestrictedReduce) {
- // Here, ! is not in any rule and therefore not in the follow set of `word`.
- // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
-
- build(R"bnf(
- _ := sentence EOF
-
- word := IDENTIFIER
- sentence := word word [recover=AcceptAnyTokenInstead]
- )bnf");
-
- clang::LangOptions LOptions;
- const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("AcceptAnyTokenInstead"),
- [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
- "[ 0, 1) ├─word := IDENTIFIER\n"
- "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
- "[ 1, end) └─word := <opaque>\n");
-}
-
-TEST_F(GLRTest, RecoveryFromStartOfInput) {
- build(R"bnf(
- _ := start [recover=Fallback] EOF
-
- start := IDENTIFIER
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- bool fallback_recovered = false;
- auto fallback = [&](Token::Index Start, const TokenStream & Code) {
- fallback_recovered = true;
- return Code.tokens().size();
- };
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("Fallback"),
- fallback);
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("?", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("start"), TestLang);
- EXPECT_TRUE(fallback_recovered);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) start := <opaque>\n");
-}
-
-TEST_F(GLRTest, RepeatedRecovery) {
- // We require multiple steps of recovery at eof and then a reduction in order
- // to successfully parse.
- build(R"bnf(
- _ := function EOF
- # FIXME: this forces EOF to be in follow(signature).
- # Remove it once we use unconstrained reduction for recovery.
- _ := signature EOF
-
- function := signature body [recover=Skip]
- signature := IDENTIFIER params [recover=Skip]
- params := ( )
- body := { }
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("Skip"),
- [](Token::Index Start, const TokenStream &) { return Start; });
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("main", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("function"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) function := signature body [recover=Skip]\n"
- "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
- "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
- "[ 1, 1) │ └─params := <opaque>\n"
- "[ 1, end) └─body := <opaque>\n");
-}
-
-TEST_F(GLRTest, NoExplicitAccept) {
- build(R"bnf(
- _ := test EOF
-
- test := IDENTIFIER test
- test := IDENTIFIER
- )bnf");
- clang::LangOptions LOptions;
- // Given the following input, and the grammar above, we perform two reductions
- // of the nonterminal `test` when the next token is `eof`, verify that the
- // parser stops at the right state.
- const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
- TestLang.Table = LRTable::buildSLR(TestLang.G);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) test := IDENTIFIER test\n"
- "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
- "[ 1, end) └─test := IDENTIFIER\n"
- "[ 1, end) └─IDENTIFIER := tok[1]\n");
-}
-
-TEST_F(GLRTest, GuardExtension) {
- build(R"bnf(
- _ := start EOF
-
- start := IDENTIFIER [guard]
- )bnf");
- TestLang.Guards.try_emplace(
- ruleFor("start"), [&](const GuardParams &P) {
- assert(P.RHS.size() == 1 &&
- P.RHS.front()->symbol() ==
- tokenSymbol(clang::tok::identifier));
- return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()]
- .text() == "test";
- });
- clang::LangOptions LOptions;
- TestLang.Table = LRTable::buildSLR(TestLang.G);
-
- std::string Input = "test";
- const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
- EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
- .dumpRecursive(TestLang.G),
- "[ 0, end) start := IDENTIFIER [guard]\n"
- "[ 0, end) └─IDENTIFIER := tok[0]\n");
-
- Input = "notest";
- const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
- EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang)
- .dumpRecursive(TestLang.G),
- "[ 0, end) start := <opaque>\n");
-}
-
-TEST(GSSTest, GC) {
- // ┌-A-┬-AB
- // ├-B-┘
- // Root-+-C
- // ├-D
- // └-E
- GSS GSStack;
- auto *Root = GSStack.addNode(0, nullptr, {});
- auto *A = GSStack.addNode(0, nullptr, {Root});
- auto *B = GSStack.addNode(0, nullptr, {Root});
- auto *C = GSStack.addNode(0, nullptr, {Root});
- auto *D = GSStack.addNode(0, nullptr, {Root});
- auto *AB = GSStack.addNode(0, nullptr, {A, B});
-
- EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed";
- EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone";
- auto *E = GSStack.addNode(0, nullptr, {Root});
- EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
- EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C";
- EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A";
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
deleted file mode 100644
index 6b6b47b8a2dbec..00000000000000
--- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//===--- GrammarTest.cpp - grammar tests -----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <memory>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-using testing::AllOf;
-using testing::ElementsAre;
-using testing::IsEmpty;
-using testing::Pair;
-using testing::UnorderedElementsAre;
-
-MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
-template <typename... T> testing::Matcher<const Rule &> Sequence(T... IDs) {
- return testing::Property(&Rule::seq, ElementsAre(IDs...));
-}
-
-class GrammarTest : public ::testing::Test {
-public:
- void build(llvm::StringRef BNF) {
- Diags.clear();
- G = Grammar::parseBNF(BNF, Diags);
- }
-
- SymbolID id(llvm::StringRef Name) const {
- for (unsigned I = 0; I < NumTerminals; ++I)
- if (G.table().Terminals[I] == Name)
- return tokenSymbol(static_cast<tok::TokenKind>(I));
- for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID)
- if (G.table().Nonterminals[ID].Name == Name)
- return ID;
- ADD_FAILURE() << "No such symbol found: " << Name;
- return 0;
- }
-
- RuleID ruleFor(llvm::StringRef NonterminalName) const {
- auto RuleRange = G.table().Nonterminals[id(NonterminalName)].RuleRange;
- if (RuleRange.End - RuleRange.Start == 1)
- return G.table().Nonterminals[id(NonterminalName)].RuleRange.Start;
- ADD_FAILURE() << "Expected a single rule for " << NonterminalName
- << ", but it has " << RuleRange.End - RuleRange.Start
- << " rule!\n";
- return 0;
- }
-
-protected:
- Grammar G;
- std::vector<std::string> Diags;
-};
-
-TEST_F(GrammarTest, Basic) {
- build("_ := IDENTIFIER + _ # comment");
- EXPECT_THAT(Diags, IsEmpty());
-
- auto ExpectedRule =
- AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_")));
- EXPECT_EQ(G.symbolName(id("_")), "_");
- EXPECT_THAT(G.rulesFor(id("_")), UnorderedElementsAre(ExpectedRule));
- const auto &Rule = G.lookupRule(/*RID=*/0);
- EXPECT_THAT(Rule, ExpectedRule);
- EXPECT_THAT(G.symbolName(Rule.seq()[0]), "IDENTIFIER");
- EXPECT_THAT(G.symbolName(Rule.seq()[1]), "+");
- EXPECT_THAT(G.symbolName(Rule.seq()[2]), "_");
-}
-
-TEST_F(GrammarTest, EliminatedOptional) {
- build("_ := CONST_opt INT ;_opt");
- EXPECT_THAT(Diags, IsEmpty());
- EXPECT_THAT(G.table().Rules,
- UnorderedElementsAre(Sequence(id("INT")),
- Sequence(id("CONST"), id("INT")),
- Sequence(id("CONST"), id("INT"), id(";")),
- Sequence(id("INT"), id(";"))));
-}
-
-TEST_F(GrammarTest, RuleIDSorted) {
- build(R"bnf(
- _ := x
-
- x := y
- y := z
- z := IDENTIFIER
- )bnf");
- ASSERT_TRUE(Diags.empty());
-
- EXPECT_LT(ruleFor("z"), ruleFor("y"));
- EXPECT_LT(ruleFor("y"), ruleFor("x"));
- EXPECT_LT(ruleFor("x"), ruleFor("_"));
-}
-
-TEST_F(GrammarTest, Annotation) {
- build(R"bnf(
- _ := x
- x := IDENTIFIER [guard]
- )bnf");
- ASSERT_THAT(Diags, IsEmpty());
- EXPECT_FALSE(G.lookupRule(ruleFor("_")).Guarded);
- EXPECT_TRUE(G.lookupRule(ruleFor("x")).Guarded);
-}
-
-TEST_F(GrammarTest, Diagnostics) {
- build(R"cpp(
- _ := ,_opt
- _ := undefined-sym
- null :=
- _ := IDENFIFIE # a typo of the terminal IDENFITIER
-
- invalid
- # cycle
- a := b
- b := a
-
- _ := IDENTIFIER [unknown=value]
- )cpp");
-
- EXPECT_EQ(G.underscore(), id("_"));
- EXPECT_THAT(Diags, UnorderedElementsAre(
- "Rule '_ := ,_opt' has a nullable RHS",
- "Rule 'null := ' has a nullable RHS",
- "No rules for nonterminal: undefined-sym",
- "Failed to parse 'invalid': no separator :=",
- "Token-like name IDENFIFIE is used as a nonterminal",
- "No rules for nonterminal: IDENFIFIE",
- "The grammar contains a cycle involving symbol a",
- "Unknown attribute 'unknown'"));
-}
-
-TEST_F(GrammarTest, DuplicatedDiagnostics) {
- build(R"cpp(
- _ := test
-
- test := INT
- test := DOUBLE
- test := INT
- )cpp");
-
- EXPECT_THAT(Diags, UnorderedElementsAre("Duplicate rule: `test := INT`"));
-}
-
-TEST_F(GrammarTest, FirstAndFollowSets) {
- build(
- R"bnf(
-_ := expr
-expr := expr - term
-expr := term
-term := IDENTIFIER
-term := ( expr )
-)bnf");
- ASSERT_TRUE(Diags.empty());
- auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
- std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
- for (SymbolID ID = 0; ID < Input.size(); ++ID)
- Sets.emplace_back(ID, std::move(Input[ID]));
- return Sets;
- };
-
- EXPECT_THAT(
- ToPairs(firstSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
- Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
- Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
- EXPECT_THAT(
- ToPairs(followSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("EOF"))),
- Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
- Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
-
- build(R"bnf(
-# A simplfied C++ decl-specifier-seq.
-_ := decl-specifier-seq
-decl-specifier-seq := decl-specifier decl-specifier-seq
-decl-specifier-seq := decl-specifier
-decl-specifier := simple-type-specifier
-decl-specifier := INLINE
-simple-type-specifier := INT
- )bnf");
- ASSERT_TRUE(Diags.empty());
- EXPECT_THAT(
- ToPairs(firstSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
- Pair(id("decl-specifier-seq"),
- UnorderedElementsAre(id("INLINE"), id("INT"))),
- Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
- Pair(id("decl-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT")))));
- EXPECT_THAT(
- ToPairs(followSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("EOF"))),
- Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
- Pair(id("decl-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
- Pair(id("simple-type-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp b/clang-tools-extra/pseudo/unittests/LRTableTest.cpp
deleted file mode 100644
index 9c9f18e03a3d4c..00000000000000
--- a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-//===--- LRTableTest.cpp - ---------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/Testing/Support/SupportHelpers.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-using llvm::ValueIs;
-using testing::ElementsAre;
-using StateID = LRTable::StateID;
-
-TEST(LRTable, Builder) {
- std::vector<std::string> GrammarDiags;
- Grammar G = Grammar::parseBNF(R"bnf(
- _ := expr # rule 0
- expr := term # rule 1
- expr := expr + term # rule 2
- term := IDENTIFIER # rule 3
- )bnf",
- GrammarDiags);
- EXPECT_THAT(GrammarDiags, testing::IsEmpty());
-
- SymbolID Term = *G.findNonterminal("term");
- SymbolID Eof = tokenSymbol(tok::eof);
- SymbolID Identifier = tokenSymbol(tok::identifier);
- SymbolID Plus = tokenSymbol(tok::plus);
-
- LRTable::Builder B(G);
- // eof IDENT term
- // +-------+----+-------+------+
- // |state0 | | s0 | |
- // |state1 | | | g3 |
- // |state2 | | | |
- // +-------+----+-------+------+-------
- B.Transition[{StateID{0}, Identifier}] = StateID{0};
- B.Transition[{StateID{1}, Term}] = StateID{3};
- B.Reduce[StateID{0}].insert(RuleID{0});
- B.Reduce[StateID{1}].insert(RuleID{2});
- B.Reduce[StateID{2}].insert(RuleID{1});
- LRTable T = std::move(B).build();
-
- EXPECT_EQ(T.getShiftState(0, Eof), std::nullopt);
- EXPECT_THAT(T.getShiftState(0, Identifier), ValueIs(0));
- EXPECT_THAT(T.getReduceRules(0), ElementsAre(0));
-
- EXPECT_EQ(T.getShiftState(1, Eof), std::nullopt);
- EXPECT_EQ(T.getShiftState(1, Identifier), std::nullopt);
- EXPECT_THAT(T.getGoToState(1, Term), ValueIs(3));
- EXPECT_THAT(T.getReduceRules(1), ElementsAre(2));
-
- // Verify the behaivor for other non-available-actions terminals.
- SymbolID Int = tokenSymbol(tok::kw_int);
- EXPECT_EQ(T.getShiftState(2, Int), std::nullopt);
-
- // Check follow sets.
- EXPECT_TRUE(T.canFollow(Term, Plus));
- EXPECT_TRUE(T.canFollow(Term, Eof));
- EXPECT_FALSE(T.canFollow(Term, Int));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/TokenTest.cpp b/clang-tools-extra/pseudo/unittests/TokenTest.cpp
deleted file mode 100644
index 5b71accfad50ff..00000000000000
--- a/clang-tools-extra/pseudo/unittests/TokenTest.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-//===--- TokenTest.cpp ----------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Token.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TokenKinds.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-using testing::AllOf;
-using testing::ElementsAre;
-using testing::ElementsAreArray;
-using testing::Not;
-
-MATCHER_P2(token, Text, Kind, "") {
- return arg.Kind == Kind && arg.text() == Text;
-}
-
-MATCHER_P(hasFlag, Flag, "") { return arg.flag(Flag); }
-
-MATCHER_P2(lineIndent, Line, Indent, "") {
- return arg.Line == (unsigned)Line && arg.Indent == (unsigned)Indent;
-}
-
-MATCHER_P(originalIndex, index, "") {
- return arg.OriginalIndex == (Token::Index)index;
-}
-
-TEST(TokenTest, Lex) {
- LangOptions Opts;
- std::string Code = R"cpp(
- #include <stdio.h>
- int main() {
- return 42; // the answer
- }
- )cpp";
- TokenStream Raw = lex(Code, Opts);
- ASSERT_TRUE(Raw.isFinalized());
- EXPECT_THAT(Raw.tokens(),
- ElementsAreArray({
- // Lexing of directives is weird, especially <angled> strings.
- token("#", tok::hash),
- token("include", tok::raw_identifier),
- token("<", tok::less),
- token("stdio", tok::raw_identifier),
- token(".", tok::period),
- token("h", tok::raw_identifier),
- token(">", tok::greater),
-
- token("int", tok::raw_identifier),
- token("main", tok::raw_identifier),
- token("(", tok::l_paren),
- token(")", tok::r_paren),
- token("{", tok::l_brace),
- token("return", tok::raw_identifier),
- token("42", tok::numeric_constant),
- token(";", tok::semi),
- token("// the answer", tok::comment),
- token("}", tok::r_brace),
- }));
-
- TokenStream Cooked = cook(Raw, Opts);
- ASSERT_TRUE(Cooked.isFinalized());
- EXPECT_THAT(Cooked.tokens(),
- ElementsAreArray({
- // Cooked identifier types in directives are not meaningful.
- token("#", tok::hash),
- token("include", tok::identifier),
- token("<", tok::less),
- token("stdio", tok::identifier),
- token(".", tok::period),
- token("h", tok::identifier),
- token(">", tok::greater),
-
- token("int", tok::kw_int),
- token("main", tok::identifier),
- token("(", tok::l_paren),
- token(")", tok::r_paren),
- token("{", tok::l_brace),
- token("return", tok::kw_return),
- token("42", tok::numeric_constant),
- token(";", tok::semi),
- token("// the answer", tok::comment),
- token("}", tok::r_brace),
- }));
- // Check raw tokens point back into original source code.
- EXPECT_EQ(Raw.tokens().front().text().begin(), &Code[Code.find('#')]);
-}
-
-TEST(TokenTest, LineContinuation) {
- LangOptions Opts;
- std::string Code = R"cpp(
-one_\
-token
-two \
-tokens
- )cpp";
- TokenStream Raw = lex(Code, Opts);
- EXPECT_THAT(
- Raw.tokens(),
- ElementsAre(AllOf(token("one_\\\ntoken", tok::raw_identifier),
- hasFlag(LexFlags::StartsPPLine),
- hasFlag(LexFlags::NeedsCleaning), lineIndent(1, 0),
- originalIndex(0)),
- AllOf(token("two", tok::raw_identifier),
- hasFlag(LexFlags::StartsPPLine),
- Not(hasFlag(LexFlags::NeedsCleaning)),
- originalIndex(1)),
- AllOf(token("\\\ntokens", tok::raw_identifier),
- Not(hasFlag(LexFlags::StartsPPLine)),
- hasFlag(LexFlags::NeedsCleaning), originalIndex(2))));
-
- TokenStream Cooked = cook(Raw, Opts);
- EXPECT_THAT(
- Cooked.tokens(),
- ElementsAre(AllOf(token("one_token", tok::identifier), lineIndent(1, 0),
- originalIndex(0)),
- AllOf(token("two", tok::identifier), originalIndex(1)),
- AllOf(token("tokens", tok::identifier), originalIndex(2))));
-}
-
-TEST(TokenTest, EncodedCharacters) {
- LangOptions Opts;
- Opts.Trigraphs = true;
- Opts.Digraphs = true;
- Opts.C99 = true; // UCNs
- Opts.CXXOperatorNames = true;
- std::string Code = R"(and <: ??! '??=' \u00E9)";
- TokenStream Raw = lex(Code, Opts);
- EXPECT_THAT(
- Raw.tokens(),
- ElementsAre( // and is not recognized as && until cook().
- AllOf(token("and", tok::raw_identifier),
- Not(hasFlag(LexFlags::NeedsCleaning))),
- // Digraphs are just different spellings of tokens.
- AllOf(token("<:", tok::l_square),
- Not(hasFlag(LexFlags::NeedsCleaning))),
- // Trigraps are interpreted, still need text cleaning.
- AllOf(token(R"(??!)", tok::pipe), hasFlag(LexFlags::NeedsCleaning)),
- // Trigraphs must be substituted inside constants too.
- AllOf(token(R"('??=')", tok::char_constant),
- hasFlag(LexFlags::NeedsCleaning)),
- // UCNs need substitution.
- AllOf(token(R"(\u00E9)", tok::raw_identifier),
- hasFlag(LexFlags::NeedsCleaning))));
-
- TokenStream Cooked = cook(Raw, Opts);
- EXPECT_THAT(
- Cooked.tokens(),
- ElementsAre(token("and", tok::ampamp), // alternate spelling recognized
- token("<:", tok::l_square),
- token("|", tok::pipe), // trigraph substituted
- token("'#'", tok::char_constant), // trigraph substituted
- token("é", tok::identifier))); // UCN substituted
-}
-
-TEST(TokenTest, Indentation) {
- LangOptions Opts;
- std::string Code = R"cpp( hello world
-no_indent \
- line_was_continued
-)cpp";
- TokenStream Raw = lex(Code, Opts);
- EXPECT_THAT(Raw.tokens(), ElementsAreArray({
- lineIndent(0, 3), // hello
- lineIndent(0, 3), // world
- lineIndent(1, 0), // no_indent
- lineIndent(2, 2), // line_was_continued
- }));
-}
-
-TEST(TokenTest, SplitGreaterGreater) {
- LangOptions Opts;
- std::string Code = R"cpp(
->> // split
-// >> with an escaped newline in the middle, split
->\
->
->>= // not split
-)cpp";
- TokenStream Cook = cook(lex(Code, Opts), Opts);
- TokenStream Split = stripComments(Cook);
- EXPECT_THAT(Split.tokens(),
- ElementsAre(AllOf(token(">", tok::greater), originalIndex(0)),
- AllOf(token(">", tok::greater), originalIndex(0)),
- // Token 1 and 2 are comments.
- AllOf(token(">", tok::greater), originalIndex(3)),
- AllOf(token(">", tok::greater), originalIndex(3)),
- AllOf(token(">>=", tok::greatergreaterequal),
- originalIndex(4))));
-}
-
-TEST(TokenTest, DropComments) {
- LangOptions Opts;
- std::string Code = R"cpp(
- // comment
- int /*abc*/;
-)cpp";
- TokenStream Raw = cook(lex(Code, Opts), Opts);
- TokenStream Stripped = stripComments(Raw);
- EXPECT_THAT(
- Raw.tokens(),
- ElementsAre(AllOf(token("// comment", tok::comment), originalIndex(0)),
- AllOf(token("int", tok::kw_int), originalIndex(1)),
- AllOf(token("/*abc*/", tok::comment), originalIndex(2)),
- AllOf(token(";", tok::semi), originalIndex(3))));
-
- EXPECT_THAT(Stripped.tokens(),
- ElementsAre(AllOf(token("int", tok::kw_int), originalIndex(1)),
- AllOf(token(";", tok::semi), originalIndex(3))));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang/docs/ClangFormattedStatus.rst b/clang/docs/ClangFormattedStatus.rst
index b917e077679b47..2475a5d4b2775b 100644
--- a/clang/docs/ClangFormattedStatus.rst
+++ b/clang/docs/ClangFormattedStatus.rst
@@ -799,11 +799,6 @@ tree in terms of conformance to :doc:`ClangFormat` as of: March 06, 2022 17:32:2
- `1`
- `0`
- :good:`100%`
- * - clang/tools/clang-pseudo
- - `1`
- - `1`
- - `0`
- - :good:`100%`
* - clang/tools/clang-refactor
- `4`
- `4`
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
index fa40ea74fb7e7d..67ff085144f4de 100644
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -603,7 +603,6 @@ clang/tools/clang-fuzzer/ExampleClangLoopProtoFuzzer.cpp
clang/tools/clang-fuzzer/handle-llvm/handle_llvm.h
clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
-clang/tools/clang-pseudo/ClangPseudo.cpp
clang/tools/clang-refactor/ClangRefactor.cpp
clang/tools/clang-refactor/TestSupport.cpp
clang/tools/clang-refactor/TestSupport.h
More information about the cfe-commits
mailing list