[clang-tools-extra] [pseudo] remove most of clang-pseudo (PR #80081)
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Tue Jan 30 15:55:58 PST 2024
https://github.com/sam-mccall created https://github.com/llvm/llvm-project/pull/80081
This was never completed, in particular we still wanted:
- disambiguating all grammatical ambiguity, e.g. by cross-referencing
reused identifiers
- heuristic symbol resolution
- conversion to syntax trees
The parts still used by clangd remain and will be dealt with later.
See https://discourse.llvm.org/t/removing-pseudo-parser/71131/5
Original design doc: https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
>From d0fd37ae2a300e72ca78715639208bc52bb44dbe Mon Sep 17 00:00:00 2001
From: Sam McCall <sam.mccall at gmail.com>
Date: Wed, 31 Jan 2024 00:48:53 +0100
Subject: [PATCH] [pseudo] remove most of clang-pseudo
This was never completed, in particular we still wanted:
- disambiguating all grammatical ambiguity, e.g. by cross-referencing
reused identifiers
- heuristic symbol resolution
- conversion to syntax trees
The parts still used by clangd remain and will be dealt with later.
See https://discourse.llvm.org/t/removing-pseudo-parser/71131/5
Original design doc: https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
---
clang-tools-extra/clangd/ClangdServer.h | 1 +
clang-tools-extra/pseudo/CMakeLists.txt | 6 -
clang-tools-extra/pseudo/README.md | 7 +
.../pseudo/benchmarks/Benchmark.cpp | 156 ----
.../pseudo/benchmarks/CMakeLists.txt | 9 -
.../pseudo/fuzzer/CMakeLists.txt | 16 -
clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp | 82 --
clang-tools-extra/pseudo/fuzzer/Main.cpp | 16 -
clang-tools-extra/pseudo/gen/CMakeLists.txt | 11 -
clang-tools-extra/pseudo/gen/Main.cpp | 172 ----
.../pseudo/include/CMakeLists.txt | 31 -
.../include/clang-pseudo/Disambiguate.h | 64 --
.../pseudo/include/clang-pseudo/Forest.h | 236 ------
.../pseudo/include/clang-pseudo/GLR.h | 170 ----
.../pseudo/include/clang-pseudo/Language.h | 64 --
.../pseudo/include/clang-pseudo/cli/CLI.h | 35 -
.../pseudo/include/clang-pseudo/cxx/CXX.h | 91 --
.../include/clang-pseudo/grammar/Grammar.h | 230 -----
.../include/clang-pseudo/grammar/LRGraph.h | 196 -----
.../include/clang-pseudo/grammar/LRTable.h | 278 ------
clang-tools-extra/pseudo/lib/CMakeLists.txt | 8 -
clang-tools-extra/pseudo/lib/Disambiguate.cpp | 48 --
clang-tools-extra/pseudo/lib/Forest.cpp | 199 -----
clang-tools-extra/pseudo/lib/GLR.cpp | 772 -----------------
clang-tools-extra/pseudo/lib/cli/CLI.cpp | 54 --
.../pseudo/lib/cli/CMakeLists.txt | 15 -
.../pseudo/lib/cxx/CMakeLists.txt | 15 -
clang-tools-extra/pseudo/lib/cxx/CXX.cpp | 452 ----------
clang-tools-extra/pseudo/lib/cxx/cxx.bnf | 776 -----------------
.../pseudo/lib/grammar/CMakeLists.txt | 10 -
.../pseudo/lib/grammar/Grammar.cpp | 190 -----
.../pseudo/lib/grammar/GrammarBNF.cpp | 362 --------
.../pseudo/lib/grammar/LRGraph.cpp | 265 ------
.../pseudo/lib/grammar/LRTable.cpp | 79 --
.../pseudo/lib/grammar/LRTableBuild.cpp | 121 ---
clang-tools-extra/pseudo/test/CMakeLists.txt | 2 -
.../pseudo/test/check-cxx-bnf.test | 2 -
.../pseudo/test/crash/backslashes.c | 4 -
.../pseudo/test/cxx/capture-list.cpp | 23 -
.../pseudo/test/cxx/contextual-keywords.cpp | 9 -
.../pseudo/test/cxx/dangling-else.cpp | 22 -
.../pseudo/test/cxx/decl-specfier-seq.cpp | 27 -
.../pseudo/test/cxx/declarator-function.cpp | 9 -
.../pseudo/test/cxx/declarator-var.cpp | 9 -
.../test/cxx/declator-member-function.cpp | 9 -
.../test/cxx/empty-member-declaration.cpp | 7 -
.../pseudo/test/cxx/empty-member-spec.cpp | 13 -
clang-tools-extra/pseudo/test/cxx/keyword.cpp | 12 -
.../pseudo/test/cxx/literals.cpp | 43 -
.../pseudo/test/cxx/mixed-designator.cpp | 27 -
.../pseudo/test/cxx/nested-name-specifier.cpp | 28 -
.../pseudo/test/cxx/parameter-decl-clause.cpp | 14 -
.../pseudo/test/cxx/predefined-identifier.cpp | 5 -
.../test/cxx/recovery-func-parameters.cpp | 13 -
.../pseudo/test/cxx/recovery-init-list.cpp | 13 -
.../pseudo/test/cxx/structured-binding.cpp | 6 -
.../cxx/template-empty-type-parameter.cpp | 3 -
.../pseudo/test/cxx/unsized-array.cpp | 7 -
clang-tools-extra/pseudo/test/fuzzer.cpp | 4 -
.../pseudo/test/glr-variant-start.cpp | 9 -
clang-tools-extra/pseudo/test/glr.cpp | 30 -
clang-tools-extra/pseudo/test/html-forest.c | 8 -
clang-tools-extra/pseudo/test/lex.c | 42 -
.../pseudo/test/lr-build-basic.test | 32 -
.../pseudo/test/lr-build-conflicts.test | 49 --
.../pseudo/test/strip-directives.c | 49 --
clang-tools-extra/pseudo/tool/CMakeLists.txt | 29 -
clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 243 ------
clang-tools-extra/pseudo/tool/HTMLForest.cpp | 192 -----
clang-tools-extra/pseudo/tool/HTMLForest.css | 93 ---
clang-tools-extra/pseudo/tool/HTMLForest.html | 15 -
clang-tools-extra/pseudo/tool/HTMLForest.js | 290 -------
.../pseudo/unittests/CMakeLists.txt | 8 -
.../pseudo/unittests/CXXTest.cpp | 30 -
.../pseudo/unittests/DisambiguateTest.cpp | 111 ---
.../pseudo/unittests/ForestTest.cpp | 180 ----
.../pseudo/unittests/GLRTest.cpp | 789 ------------------
.../pseudo/unittests/GrammarTest.cpp | 213 -----
.../pseudo/unittests/LRTableTest.cpp | 76 --
79 files changed, 8 insertions(+), 8028 deletions(-)
delete mode 100644 clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
delete mode 100644 clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
delete mode 100644 clang-tools-extra/pseudo/fuzzer/Main.cpp
delete mode 100644 clang-tools-extra/pseudo/gen/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/gen/Main.cpp
delete mode 100644 clang-tools-extra/pseudo/include/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/Language.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
delete mode 100644 clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
delete mode 100644 clang-tools-extra/pseudo/lib/Disambiguate.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/Forest.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/GLR.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/cli/CLI.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/lib/cxx/CXX.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/cxx/cxx.bnf
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
delete mode 100644 clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
delete mode 100644 clang-tools-extra/pseudo/test/check-cxx-bnf.test
delete mode 100644 clang-tools-extra/pseudo/test/crash/backslashes.c
delete mode 100644 clang-tools-extra/pseudo/test/cxx/capture-list.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/dangling-else.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/declarator-function.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/declarator-var.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/keyword.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/literals.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/structured-binding.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp
delete mode 100644 clang-tools-extra/pseudo/test/cxx/unsized-array.cpp
delete mode 100644 clang-tools-extra/pseudo/test/fuzzer.cpp
delete mode 100644 clang-tools-extra/pseudo/test/glr-variant-start.cpp
delete mode 100644 clang-tools-extra/pseudo/test/glr.cpp
delete mode 100644 clang-tools-extra/pseudo/test/html-forest.c
delete mode 100644 clang-tools-extra/pseudo/test/lex.c
delete mode 100644 clang-tools-extra/pseudo/test/lr-build-basic.test
delete mode 100644 clang-tools-extra/pseudo/test/lr-build-conflicts.test
delete mode 100644 clang-tools-extra/pseudo/test/strip-directives.c
delete mode 100644 clang-tools-extra/pseudo/tool/CMakeLists.txt
delete mode 100644 clang-tools-extra/pseudo/tool/ClangPseudo.cpp
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.cpp
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.css
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.html
delete mode 100644 clang-tools-extra/pseudo/tool/HTMLForest.js
delete mode 100644 clang-tools-extra/pseudo/unittests/CXXTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/ForestTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/GLRTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/GrammarTest.cpp
delete mode 100644 clang-tools-extra/pseudo/unittests/LRTableTest.cpp
diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
index a416602251428..7d0a1c65b8e38 100644
--- a/clang-tools-extra/clangd/ClangdServer.h
+++ b/clang-tools-extra/clangd/ClangdServer.h
@@ -168,6 +168,7 @@ class ClangdServer {
std::vector<std::string> QueryDriverGlobs;
// Whether the client supports folding only complete lines.
+ // FIXME: we currently do not behave differently based on this flag.
bool LineFoldingOnly = false;
FeatureModuleSet *FeatureModules = nullptr;
diff --git a/clang-tools-extra/pseudo/CMakeLists.txt b/clang-tools-extra/pseudo/CMakeLists.txt
index 24bc1530bb7d6..2bc0f92d063cc 100644
--- a/clang-tools-extra/pseudo/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/CMakeLists.txt
@@ -1,11 +1,5 @@
include_directories(include)
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
-add_subdirectory(include)
-add_subdirectory(gen)
add_subdirectory(lib)
-add_subdirectory(tool)
-add_subdirectory(fuzzer)
-add_subdirectory(benchmarks)
if(CLANG_INCLUDE_TESTS)
add_subdirectory(unittests)
add_subdirectory(test)
diff --git a/clang-tools-extra/pseudo/README.md b/clang-tools-extra/pseudo/README.md
index 0958f5d500e7f..b5984fdcdc097 100644
--- a/clang-tools-extra/pseudo/README.md
+++ b/clang-tools-extra/pseudo/README.md
@@ -1,3 +1,10 @@
+# Removed
+
+This was never completed and most of the implementation has been removed.
+This document remains for historical interest, for now.
+
+See https://docs.google.com/document/d/1eGkTOsFja63wsv8v0vd5JdoTonj-NlN3ujGF0T7xDbM/edit
+
# clang pseudoparser
This directory implements an approximate heuristic parser for C++, based on the
diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
deleted file mode 100644
index 087ab6c250e39..0000000000000
--- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===--- Benchmark.cpp - clang pseudoparser benchmarks ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Benchmark for the overall pseudoparser performance, it also includes other
-// important pieces of the pseudoparser (grammar compliation, LR table build
-// etc).
-//
-// Note: make sure to build the benchmark in Release mode.
-//
-// Usage:
-// tools/clang/tools/extra/pseudo/benchmarks/ClangPseudoBenchmark \
-// --grammar=../clang-tools-extra/pseudo/lib/cxx.bnf \
-// --source=../clang/lib/Sema/SemaDecl.cpp
-//
-//===----------------------------------------------------------------------===//
-
-#include "benchmark/benchmark.h"
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/DirectiveTree.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <string>
-
-using llvm::cl::desc;
-using llvm::cl::opt;
-using llvm::cl::Required;
-
-static opt<std::string> Source("source", desc("Source file"), Required);
-
-namespace clang {
-namespace pseudo {
-namespace bench {
-namespace {
-
-const std::string *SourceText = nullptr;
-const Language *Lang = nullptr;
-
-void setup() {
- auto ReadFile = [](llvm::StringRef FilePath) -> std::string {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GrammarText =
- llvm::MemoryBuffer::getFile(FilePath);
- if (std::error_code EC = GrammarText.getError()) {
- llvm::errs() << "Error: can't read file '" << FilePath
- << "': " << EC.message() << "\n";
- std::exit(1);
- }
- return GrammarText.get()->getBuffer().str();
- };
- SourceText = new std::string(ReadFile(Source));
- Lang = &getLanguageFromFlags();
-}
-
-static void buildSLR(benchmark::State &State) {
- for (auto _ : State)
- LRTable::buildSLR(Lang->G);
-}
-BENCHMARK(buildSLR);
-
-TokenStream lexAndPreprocess() {
- clang::LangOptions LangOpts = genericLangOpts();
- TokenStream RawStream = pseudo::lex(*SourceText, LangOpts);
- auto DirectiveStructure = DirectiveTree::parse(RawStream);
- chooseConditionalBranches(DirectiveStructure, RawStream);
- TokenStream Cook =
- cook(DirectiveStructure.stripDirectives(RawStream), LangOpts);
- auto Stream = stripComments(Cook);
- pairBrackets(Stream);
- return Stream;
-}
-
-static void lex(benchmark::State &State) {
- clang::LangOptions LangOpts = genericLangOpts();
- for (auto _ : State)
- clang::pseudo::lex(*SourceText, LangOpts);
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(lex);
-
-static void pairBrackets(benchmark::State &State) {
- clang::LangOptions LangOpts = genericLangOpts();
- auto Stream = clang::pseudo::lex(*SourceText, LangOpts);
- for (auto _ : State)
- pairBrackets(Stream);
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(pairBrackets);
-
-static void preprocess(benchmark::State &State) {
- clang::LangOptions LangOpts = genericLangOpts();
- TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts);
- for (auto _ : State) {
- auto DirectiveStructure = DirectiveTree::parse(RawStream);
- chooseConditionalBranches(DirectiveStructure, RawStream);
- stripComments(
- cook(DirectiveStructure.stripDirectives(RawStream), LangOpts));
- }
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(preprocess);
-
-static void glrParse(benchmark::State &State) {
- SymbolID StartSymbol = *Lang->G.findNonterminal("translation-unit");
- TokenStream Stream = lexAndPreprocess();
- for (auto _ : State) {
- pseudo::ForestArena Forest;
- pseudo::GSS GSS;
- pseudo::glrParse(ParseParams{Stream, Forest, GSS}, StartSymbol, *Lang);
- }
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(glrParse);
-
-static void full(benchmark::State &State) {
- SymbolID StartSymbol = *Lang->G.findNonterminal("translation-unit");
- for (auto _ : State) {
- TokenStream Stream = lexAndPreprocess();
- pseudo::ForestArena Forest;
- pseudo::GSS GSS;
- pseudo::glrParse(ParseParams{Stream, Forest, GSS}, StartSymbol, *Lang);
- }
- State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
- SourceText->size());
-}
-BENCHMARK(full);
-
-} // namespace
-} // namespace bench
-} // namespace pseudo
-} // namespace clang
-
-int main(int argc, char *argv[]) {
- benchmark::Initialize(&argc, argv);
- llvm::cl::ParseCommandLineOptions(argc, argv);
- clang::pseudo::bench::setup();
- benchmark::RunSpecifiedBenchmarks();
- return 0;
-}
diff --git a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt b/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
deleted file mode 100644
index 859db991403cd..0000000000000
--- a/clang-tools-extra/pseudo/benchmarks/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-add_benchmark(ClangPseudoBenchmark Benchmark.cpp)
-
-target_link_libraries(ClangPseudoBenchmark
- PRIVATE
- clangPseudo
- clangPseudoCLI
- clangPseudoGrammar
- LLVMSupport
- )
diff --git a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt b/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
deleted file mode 100644
index e1d79873471f0..0000000000000
--- a/clang-tools-extra/pseudo/fuzzer/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- FuzzerCLI
- Support
- )
-
-add_llvm_fuzzer(clang-pseudo-fuzzer
- Fuzzer.cpp
- DUMMY_MAIN Main.cpp
- )
-
-target_link_libraries(clang-pseudo-fuzzer
- PRIVATE
- clangPseudo
- clangPseudoCLI
- clangPseudoGrammar
- )
diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
deleted file mode 100644
index 87b9d15480cc3..0000000000000
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//===-- Fuzzer.cpp - Fuzz the pseudoparser --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/DirectiveTree.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-class Fuzzer {
- clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
- bool Print;
-
-public:
- Fuzzer(bool Print) : Print(Print) {}
-
- void operator()(llvm::StringRef Code) {
- std::string CodeStr = Code.str(); // Must be null-terminated.
- auto RawStream = lex(CodeStr, LangOpts);
- auto DirectiveStructure = DirectiveTree::parse(RawStream);
- clang::pseudo::chooseConditionalBranches(DirectiveStructure, RawStream);
- // FIXME: strip preprocessor directives
- auto ParseableStream =
- clang::pseudo::stripComments(cook(RawStream, LangOpts));
-
- clang::pseudo::ForestArena Arena;
- clang::pseudo::GSS GSS;
- const Language &Lang = getLanguageFromFlags();
- auto &Root =
- glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
- *Lang.G.findNonterminal("translation-unit"), Lang);
- if (Print)
- llvm::outs() << Root.dumpRecursive(Lang.G);
- }
-};
-
-Fuzzer *Fuzz = nullptr;
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
-
-extern "C" {
-
-// Set up the fuzzer from command line flags:
-// -print - used for testing the fuzzer
-int LLVMFuzzerInitialize(int *Argc, char ***Argv) {
- bool PrintForest = false;
- auto ConsumeArg = [&](llvm::StringRef Arg) -> bool {
- if (Arg == "-print") {
- PrintForest = true;
- return true;
- }
- return false;
- };
- *Argc = std::remove_if(*Argv + 1, *Argv + *Argc, ConsumeArg) - *Argv;
-
- clang::pseudo::Fuzz = new clang::pseudo::Fuzzer(PrintForest);
- return 0;
-}
-
-int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) {
- (*clang::pseudo::Fuzz)(llvm::StringRef(reinterpret_cast<char *>(Data), Size));
- return 0;
-}
-}
diff --git a/clang-tools-extra/pseudo/fuzzer/Main.cpp b/clang-tools-extra/pseudo/fuzzer/Main.cpp
deleted file mode 100644
index 542a3007a399f..0000000000000
--- a/clang-tools-extra/pseudo/fuzzer/Main.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-//===--- Main.cpp - Entry point to sanity check the fuzzer ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/FuzzMutate/FuzzerCLI.h"
-
-extern "C" int LLVMFuzzerInitialize(int *, char ***);
-extern "C" int LLVMFuzzerTestOneInput(const uint8_t *, size_t);
-int main(int argc, char *argv[]) {
- return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput,
- LLVMFuzzerInitialize);
-}
diff --git a/clang-tools-extra/pseudo/gen/CMakeLists.txt b/clang-tools-extra/pseudo/gen/CMakeLists.txt
deleted file mode 100644
index 3dd615a558751..0000000000000
--- a/clang-tools-extra/pseudo/gen/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-set(LLVM_LINK_COMPONENTS Support)
-list(REMOVE_ITEM LLVM_COMMON_DEPENDS clang-tablegen-targets)
-
-add_clang_executable(clang-pseudo-gen
- Main.cpp
- )
-
-target_link_libraries(clang-pseudo-gen
- PRIVATE
- clangPseudoGrammar
- )
diff --git a/clang-tools-extra/pseudo/gen/Main.cpp b/clang-tools-extra/pseudo/gen/Main.cpp
deleted file mode 100644
index 25cb26563837a..0000000000000
--- a/clang-tools-extra/pseudo/gen/Main.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//===--- Main.cpp - Compile BNF grammar -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a tool to compile a BNF grammar, it is used by the build system to
-// generate a necessary data bits to statically construct core pieces (Grammar,
-// LRTable etc) of the LR parser.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include <algorithm>
-
-using llvm::cl::desc;
-using llvm::cl::init;
-using llvm::cl::opt;
-using llvm::cl::Required;
-using llvm::cl::value_desc;
-using llvm::cl::values;
-
-namespace {
-enum EmitType {
- EmitSymbolList,
- EmitGrammarContent,
-};
-
-opt<std::string> Grammar("grammar", desc("Parse a BNF grammar file."),
- Required);
-opt<EmitType>
- Emit(desc("which information to emit:"),
- values(clEnumValN(EmitSymbolList, "emit-symbol-list",
- "Print nonterminal symbols (default)"),
- clEnumValN(EmitGrammarContent, "emit-grammar-content",
- "Print the BNF grammar content as a string")));
-
-opt<std::string> OutputFilename("o", init("-"), desc("Output"),
- value_desc("file"));
-
-std::string readOrDie(llvm::StringRef Path) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
- llvm::MemoryBuffer::getFile(Path);
- if (std::error_code EC = Text.getError()) {
- llvm::errs() << "Error: can't read grammar file '" << Path
- << "': " << EC.message() << "\n";
- ::exit(1);
- }
- return Text.get()->getBuffer().str();
-}
-} // namespace
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-// Mangles a symbol name into a valid identifier.
-//
-// These follow names in the grammar fairly closely:
-// nonterminal: `ptr-declarator` becomes `ptr_declarator`;
-// punctuator: `,` becomes `COMMA`;
-// keyword: `INT` becomes `INT`;
-// terminal: `IDENTIFIER` becomes `IDENTIFIER`;
-std::string mangleSymbol(SymbolID SID, const Grammar &G) {
- static auto &TokNames = *new std::vector<std::string>{
-#define TOK(X) llvm::StringRef(#X).upper(),
-#define KEYWORD(Keyword, Condition) llvm::StringRef(#Keyword).upper(),
-#include "clang/Basic/TokenKinds.def"
- };
- if (isToken(SID))
- return TokNames[symbolToToken(SID)];
- std::string Name = G.symbolName(SID).str();
- // translation-unit -> translation_unit
- std::replace(Name.begin(), Name.end(), '-', '_');
- return Name;
-}
-
-// Mangles the RHS of a rule definition into a valid identifier.
-//
-// These are unique only for a fixed LHS.
-// e.g. for the grammar rule `ptr-declarator := ptr-operator ptr-declarator`,
-// it is `ptr_operator__ptr_declarator`.
-std::string mangleRule(RuleID RID, const Grammar &G) {
- const auto &R = G.lookupRule(RID);
- std::string MangleName = mangleSymbol(R.seq().front(), G);
- for (SymbolID S : R.seq().drop_front()) {
- MangleName.append("__");
- MangleName.append(mangleSymbol(S, G));
- }
- return MangleName;
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
-
-int main(int argc, char *argv[]) {
- llvm::cl::ParseCommandLineOptions(argc, argv, "");
-
- std::string GrammarText = readOrDie(Grammar);
- std::vector<std::string> Diags;
- auto G = clang::pseudo::Grammar::parseBNF(GrammarText, Diags);
-
- if (!Diags.empty()) {
- llvm::errs() << llvm::join(Diags, "\n");
- return 1;
- }
-
- std::error_code EC;
- llvm::ToolOutputFile Out{OutputFilename, EC, llvm::sys::fs::OF_None};
- if (EC) {
- llvm::errs() << EC.message() << '\n';
- return 1;
- }
-
- switch (Emit) {
- case EmitSymbolList:
- Out.os() << R"cpp(
-#ifndef NONTERMINAL
-#define NONTERMINAL(NAME, ID)
-#endif
-#ifndef RULE
-#define RULE(LHS, RHS, ID)
-#endif
-#ifndef EXTENSION
-#define EXTENSION(NAME, ID)
-#endif
-)cpp";
- for (clang::pseudo::SymbolID ID = 0; ID < G.table().Nonterminals.size();
- ++ID) {
- Out.os() << llvm::formatv("NONTERMINAL({0}, {1})\n",
- clang::pseudo::mangleSymbol(ID, G), ID);
- for (const clang::pseudo::Rule &R : G.rulesFor(ID)) {
- clang::pseudo::RuleID RID = &R - G.table().Rules.data();
- Out.os() << llvm::formatv("RULE({0}, {1}, {2})\n",
- clang::pseudo::mangleSymbol(R.Target, G),
- clang::pseudo::mangleRule(RID, G), RID);
- }
- }
- for (clang::pseudo::ExtensionID EID = 1 /*skip the sentinel 0 value*/;
- EID < G.table().AttributeValues.size(); ++EID) {
- llvm::StringRef Name = G.table().AttributeValues[EID];
- assert(!Name.empty());
- Out.os() << llvm::formatv("EXTENSION({0}, {1})\n", Name, EID);
- }
- Out.os() << R"cpp(
-#undef NONTERMINAL
-#undef RULE
-#undef EXTENSION
-)cpp";
- break;
- case EmitGrammarContent:
- for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) {
- Out.os() << '"';
- Out.os().write_escaped((Line + "\n").str());
- Out.os() << "\"\n";
- }
- break;
- }
-
- Out.keep();
-
- return 0;
-}
diff --git a/clang-tools-extra/pseudo/include/CMakeLists.txt b/clang-tools-extra/pseudo/include/CMakeLists.txt
deleted file mode 100644
index 2334cfa12e337..0000000000000
--- a/clang-tools-extra/pseudo/include/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-# The cxx.bnf grammar file
-set(cxx_bnf ${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxx/cxx.bnf)
-
-setup_host_tool(clang-pseudo-gen CLANG_PSEUDO_GEN pseudo_gen pseudo_gen_target)
-
-# Generate inc files.
-set(cxx_symbols_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXSymbols.inc)
-add_custom_command(OUTPUT ${cxx_symbols_inc}
- COMMAND "${pseudo_gen}"
- --grammar ${cxx_bnf}
- --emit-symbol-list
- -o ${cxx_symbols_inc}
- COMMENT "Generating nonterminal symbol file for cxx grammar..."
- DEPENDS ${pseudo_gen_target} ${cxx_bnf}
- VERBATIM)
-
-set(cxx_bnf_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXBNF.inc)
-add_custom_command(OUTPUT ${cxx_bnf_inc}
- COMMAND "${pseudo_gen}"
- --grammar ${cxx_bnf}
- --emit-grammar-content
- -o ${cxx_bnf_inc}
- COMMENT "Generating bnf string file for cxx grammar..."
- DEPENDS ${pseudo_gen_target} ${cxx_bnf}
- VERBATIM)
-
-# add_custom_command does not create a new target, we need to deine a target
-# explicitly, so that other targets can depend on it.
-add_custom_target(cxx_gen
- DEPENDS ${cxx_symbols_inc} ${cxx_bnf_inc}
- VERBATIM)
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h b/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
deleted file mode 100644
index 5f3a22c9cabb3..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
+++ /dev/null
@@ -1,64 +0,0 @@
-//===--- Disambiguate.h - Find the best tree in the forest -------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A GLR parse forest represents every possible parse tree for the source code.
-//
-// Before we can do useful analysis/editing of the code, we need to pick a
-// single tree which we think is accurate. We use three main types of clues:
-//
-// A) Semantic language rules may restrict which parses are allowed.
-// For example, `string string string X` is *grammatical* C++, but only a
-// single type-name is allowed in a decl-specifier-sequence.
-// Where possible, these interpretations are forbidden by guards.
-// Sometimes this isn't possible, or we want our parser to be lenient.
-//
-// B) Some constructs are rarer, while others are common.
-// For example `a<b>::c` is often a template specialization, and rarely a
-// double comparison between a, b, and c.
-//
-// C) Identifier text hints whether they name types/values/templates etc.
-// "std" is usually a namespace, a project index may also guide us.
-// Hints may be within the document: if one occurrence of 'foo' is a variable
-// then the others probably are too.
-// (Text need not match: similar CaseStyle can be a weak hint, too).
-//
-//----------------------------------------------------------------------------//
-//
-// Mechanically, we replace each ambiguous node with its best alternative.
-//
-// "Best" is determined by assigning bonuses/penalties to nodes, to express
-// the clues of type A and B above. A forest node representing an unlikely
-// parse would apply a penalty to every subtree is is present in.
-// Disambiguation proceeds bottom-up, so that the score of each alternative
-// is known when a decision is made.
-//
-// Identifier-based hints within the document mean some nodes should be
-// *correlated*. Rather than resolve these simultaneously, we make the most
-// certain decisions first and use these results to update bonuses elsewhere.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Forest.h"
-
-namespace clang::pseudo {
-
-struct DisambiguateParams {};
-
-// Maps ambiguous nodes onto the index of their preferred alternative.
-using Disambiguation = llvm::DenseMap<const ForestNode *, unsigned>;
-
-// Resolve each ambiguous node in the forest.
-// Maps each ambiguous node to the index of the chosen alternative.
-// FIXME: current implementation is a placeholder and chooses arbitrarily.
-Disambiguation disambiguate(const ForestNode *Root,
- const DisambiguateParams &Params);
-
-// Remove all ambiguities from the forest, resolving them according to Disambig.
-void removeAmbiguities(ForestNode *&Root, const Disambiguation &Disambig);
-
-} // namespace clang::pseudo
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
deleted file mode 100644
index e9edb40e02b64..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ /dev/null
@@ -1,236 +0,0 @@
-//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A parse forest represents a set of possible parse trees efficiently, it is
-// produced by the GLR parser.
-//
-// Despite the name, its data structure is a tree-like DAG with a single root.
-// Multiple ways to parse the same tokens are presented as an ambiguous node
-// with all possible interpretations as children.
-// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
-// "expr := expr + expr", they will share a Sequence node representing the expr.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_FOREST_H
-#define CLANG_PSEUDO_FOREST_H
-
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Allocator.h"
-#include <cstdint>
-
-namespace clang {
-namespace pseudo {
-
-// A node represents ways to parse a sequence of tokens, it interprets a fixed
-// range of tokens as a fixed grammar symbol.
-//
-// There are different kinds of nodes, some nodes have "children" (stored in a
-// trailing array) and have pointers to them. "Children" has different semantics
-// depending on the node kinds. For an Ambiguous node, it means all
-// possible interpretations; for a Sequence node, it means each symbol on the
-// right hand side of the production rule.
-//
-// Since this is a node in a DAG, a node may have multiple parents. And a node
-// doesn't have parent pointers.
-class alignas(class ForestNode *) ForestNode {
-public:
- class RecursiveIterator;
- enum Kind {
- // A Terminal node is a single terminal symbol bound to a token.
- Terminal,
- // A Sequence node is a nonterminal symbol parsed from a grammar rule,
- // elements() are the parses of each symbol on the RHS of the rule.
- // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
- // are [X, Y, Z].
- Sequence,
- // An Ambiguous node exposes multiple ways to interpret the code as the
- // same symbol, alternatives() are all possible parses.
- Ambiguous,
- // An Opaque node is a placeholder. It asserts that tokens match a symbol,
- // without saying how.
- // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
- // code).
- Opaque,
- };
- Kind kind() const { return K; }
-
- SymbolID symbol() const { return Symbol; }
-
- // The start of the token range, it is a poistion within a token stream.
- Token::Index startTokenIndex() const { return StartIndex; }
-
- // Returns the corresponding grammar rule.
- // REQUIRES: this is a Sequence node.
- RuleID rule() const {
- assert(kind() == Sequence);
- return Data & ((1 << RuleBits) - 1);
- }
- // Returns the parses of each element on the RHS of the rule.
- // REQUIRES: this is a Sequence node;
- llvm::ArrayRef<const ForestNode *> elements() const {
- assert(kind() == Sequence);
- return children(Data >> RuleBits);
- }
- llvm::MutableArrayRef<ForestNode *> elements() {
- assert(kind() == Sequence);
- return children(Data >> RuleBits);
- }
-
- // Returns all possible interpretations of the code.
- // REQUIRES: this is an Ambiguous node.
- llvm::ArrayRef<const ForestNode *> alternatives() const {
- assert(kind() == Ambiguous);
- return children(Data);
- }
- llvm::MutableArrayRef<ForestNode *> alternatives() {
- assert(kind() == Ambiguous);
- return children(Data);
- }
-
- llvm::ArrayRef<const ForestNode *> children() const {
- switch (kind()) {
- case Sequence:
- return elements();
- case Ambiguous:
- return alternatives();
- case Terminal:
- case Opaque:
- return {};
- }
- llvm_unreachable("Bad kind");
- }
-
- // Iteration over all nodes in the forest, including this.
- llvm::iterator_range<RecursiveIterator> descendants() const;
-
- std::string dump(const Grammar &) const;
- std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
-
-private:
- friend class ForestArena;
-
- ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
- : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}
-
- ForestNode(const ForestNode &) = delete;
- ForestNode &operator=(const ForestNode &) = delete;
- ForestNode(ForestNode &&) = delete;
- ForestNode &operator=(ForestNode &&) = delete;
-
- static uint16_t sequenceData(RuleID Rule,
- llvm::ArrayRef<const ForestNode *> Elements) {
- assert(Rule < (1 << RuleBits));
- assert(Elements.size() < (1 << (16 - RuleBits)));
- return Rule | Elements.size() << RuleBits;
- }
- static uint16_t
- ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
- return Alternatives.size();
- }
-
- // Retrieves the trailing array.
- llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
- return llvm::ArrayRef(reinterpret_cast<ForestNode *const *>(this + 1), Num);
- }
- llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) {
- return llvm::MutableArrayRef(reinterpret_cast<ForestNode **>(this + 1),
- Num);
- }
-
- Token::Index StartIndex;
- Kind K : 4;
- SymbolID Symbol : SymbolBits;
- // Sequence - child count : 4 | RuleID : RuleBits (12)
- // Ambiguous - child count : 16
- // Terminal, Opaque - unused
- uint16_t Data;
- // An array of ForestNode* following the object.
-};
-// ForestNode may not be destroyed (for BumpPtrAllocator).
-static_assert(std::is_trivially_destructible<ForestNode>());
-
-// A memory arena for the parse forest.
-class ForestArena {
-public:
- llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
- ForestNode &createSequence(SymbolID SID, RuleID RID,
- llvm::ArrayRef<const ForestNode *> Elements) {
- assert(!Elements.empty());
- return create(ForestNode::Sequence, SID,
- Elements.front()->startTokenIndex(),
- ForestNode::sequenceData(RID, Elements), Elements);
- }
- ForestNode &createAmbiguous(SymbolID SID,
- llvm::ArrayRef<const ForestNode *> Alternatives) {
- assert(!Alternatives.empty());
- assert(llvm::all_of(Alternatives,
- [SID](const ForestNode *Alternative) {
- return SID == Alternative->symbol();
- }) &&
- "Ambiguous alternatives must represent the same symbol!");
- return create(ForestNode::Ambiguous, SID,
- Alternatives.front()->startTokenIndex(),
- ForestNode::ambiguousData(Alternatives), Alternatives);
- }
- ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
- return create(ForestNode::Opaque, SID, Start, 0, {});
- }
-
- ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
- return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
- }
-
- size_t nodeCount() const { return NodeCount; }
- size_t bytes() const { return Arena.getBytesAllocated() + sizeof(*this); }
-
-private:
- ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
- uint16_t Data,
- llvm::ArrayRef<const ForestNode *> Elements) {
- ++NodeCount;
- ForestNode *New = new (Arena.Allocate(
- sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
- alignof(ForestNode))) ForestNode(K, SID, Start, Data);
- if (!Elements.empty())
- llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
- return *New;
- }
-
- llvm::BumpPtrAllocator Arena;
- uint32_t NodeCount = 0;
-};
-
-class ForestNode::RecursiveIterator
- : public llvm::iterator_facade_base<ForestNode::RecursiveIterator,
- std::input_iterator_tag,
- const ForestNode> {
- llvm::DenseSet<const ForestNode *> Seen;
- struct StackFrame {
- const ForestNode *Parent;
- unsigned ChildIndex;
- };
- std::vector<StackFrame> Stack;
- const ForestNode *Cur;
-
-public:
- RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {}
-
- const ForestNode &operator*() const { return *Cur; }
- void operator++();
- bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; }
- bool operator!=(const RecursiveIterator &I) const { return !(*this == I); }
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_FOREST_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
deleted file mode 100644
index 0100f818d4ed7..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ /dev/null
@@ -1,170 +0,0 @@
-//===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This implements a standard Generalized LR (GLR) parsing algorithm.
-//
-// The GLR parser behaves as a normal LR parser until it encounters a conflict.
-// To handle a conflict (where there are multiple actions could perform), the
-// parser will simulate nondeterminism by doing a breadth-first search
-// over all the possibilities.
-//
-// Basic mechanisims of the GLR parser:
-// - A number of processes are operated in parallel.
-// - Each process has its own parsing stack and behaves as a standard
-// determinism LR parser.
-// - When a process encounters a conflict, it will be fork (one for each
-// avaiable action).
-// - When a process encounters an error, it is abandoned.
-// - All process are synchronized by the lookahead token: they perfrom shift
-// action at the same time, which means some processes need wait until other
-// processes have performed all reduce actions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GLR_H
-#define CLANG_PSEUDO_GLR_H
-
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "llvm/Support/Allocator.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-
-// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
-// parser.
-//
-// Each node stores a parse state, the last parsed ForestNode, and the parent
-// node. There may be several heads (top of stack), and the parser operates by:
-// - shift: pushing terminal symbols on top of the stack
-// - reduce: replace N symbols on top of the stack with one nonterminal
-//
-// The structure is a DAG rather than a linear stack:
-// - GLR allows multiple actions (conflicts) on the same head, producing forks
-// where several nodes have the same parent
-// - The parser merges nodes with the same (state, ForestNode), producing joins
-// where one node has multiple parents
-//
-// The parser is responsible for creating nodes and keeping track of the set of
-// heads. The GSS class is mostly an arena for them.
-struct GSS {
- // A node represents a partial parse of the input up to some point.
- //
- // It is the equivalent of a frame in an LR parse stack.
- // Like such a frame, it has an LR parse state and a syntax-tree node
- // representing the last parsed symbol (a ForestNode in our case).
- // Unlike a regular LR stack frame, it may have multiple parents.
- //
- // Nodes are not exactly pushed and popped on the stack: pushing is just
- // allocating a new head node with a parent pointer to the old head. Popping
- // is just forgetting about a node and remembering its parent instead.
- struct alignas(struct Node *) Node {
- // LR state describing how parsing should continue from this head.
- LRTable::StateID State;
- // Used internally to track reachability during garbage collection.
- bool GCParity;
- // Have we already used this node for error recovery? (prevents loops)
- mutable bool Recovered = false;
- // Number of the parents of this node.
- // The parents hold previous parsed symbols, and may resume control after
- // this node is reduced.
- unsigned ParentCount;
- // The parse node for the last parsed symbol.
- // This symbol appears on the left of the dot in the parse state's items.
- // (In the literature, the node is attached to the *edge* to the parent).
- const ForestNode *Payload = nullptr;
-
- llvm::ArrayRef<const Node *> parents() const {
- return llvm::ArrayRef(reinterpret_cast<const Node *const *>(this + 1),
- ParentCount);
- };
- // Parents are stored as a trailing array of Node*.
- };
-
- // Allocates a new node in the graph.
- const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
- llvm::ArrayRef<const Node *> Parents);
- // Frees all nodes not reachable as ancestors of Roots, and returns the count.
- // Calling this periodically prevents steady memory growth of the GSS.
- unsigned gc(std::vector<const Node *> &&Roots);
-
- size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
- size_t nodesCreated() const { return NodesCreated; }
-
-private:
- // Nodes are recycled using freelists.
- // They are variable size, so use one free-list per distinct #parents.
- std::vector<std::vector<Node *>> FreeList;
- Node *allocate(unsigned Parents);
- void destroy(Node *N);
- // The list of nodes created and not destroyed - our candidates for gc().
- std::vector<Node *> Alive;
- bool GCParity = false; // All nodes should match this, except during GC.
-
- llvm::BumpPtrAllocator Arena;
- unsigned NodesCreated = 0;
-};
-llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);
-
-// Parameters for the GLR parsing.
-struct ParseParams {
- // The token stream to parse.
- const TokenStream &Code;
-
- // Arena for data structure used by the GLR algorithm.
- ForestArena &Forest; // Storage for the output forest.
- GSS &GSStack; // Storage for parsing stacks.
-};
-
-// Parses the given token stream as the start symbol with the GLR algorithm,
-// and returns a forest node of the start symbol.
-//
-// A rule `_ := StartSymbol` must exit for the chosen start symbol.
-//
-// If the parsing fails, we model it as an opaque node in the forest.
-ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
- const Language &Lang);
-
-// Shift a token onto all OldHeads, placing the results into NewHeads.
-//
-// Exposed for testing only.
-void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
- const ForestNode &NextTok, const ParseParams &Params,
- const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
-// Applies available reductions on Heads, appending resulting heads to the list.
-//
-// Exposed for testing only.
-void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
- const ParseParams &Params, const Language &Lang);
-
-// Heuristically recover from a state where no further parsing is possible.
-//
-// OldHeads is the parse state at TokenIndex.
-// This function consumes zero or more tokens by advancing TokenIndex,
-// and places any recovery states created in NewHeads.
-//
-// On failure, NewHeads is empty and TokenIndex is unchanged.
-//
-// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
-// there is a risk of the parser falling into an infinite loop, creating an
-// endless sequence of recovery nodes.
-// Generally it is safe for recovery to match 0 tokens against sequence symbols
-// like `statement-seq`, as the grammar won't permit another statement-seq
-// immediately afterwards. However recovery strategies for `statement` should
-// consume at least one token, as statements may be adjacent in the input.
-void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
- unsigned &TokenIndex, const ParseParams &Params,
- const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_GLR_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h
deleted file mode 100644
index 1a2b71f081da0..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h
+++ /dev/null
@@ -1,64 +0,0 @@
-//===--- Language.h -------------------------------------------- -*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_LANGUAGE_H
-#define CLANG_PSEUDO_LANGUAGE_H
-
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-
-namespace clang {
-namespace pseudo {
-class ForestNode;
-class TokenStream;
-class LRTable;
-
-struct GuardParams {
- llvm::ArrayRef<const ForestNode *> RHS;
- const TokenStream &Tokens;
- // FIXME: use the index of Tokens.
- SymbolID Lookahead;
-};
-// A guard restricts when a grammar rule can be used.
-//
-// The GLR parser will use the guard to determine whether a rule reduction will
-// be conducted. For example, e.g. a guard may allow the rule
-// `virt-specifier := IDENTIFIER` only if the identifier's text is 'override`.
-//
-// Return true if the guard is satisfied.
-using RuleGuard = llvm::function_ref<bool(const GuardParams &)>;
-
-// A recovery strategy determines a region of code to skip when parsing fails.
-//
-// For example, given `class-def := CLASS IDENT { body [recover=Brackets] }`,
-// if parsing fails while attempting to parse `body`, we may skip up to the
-// matching `}` and assume everything between was a `body`.
-//
-// The provided index is the token where the skipped region begins.
-// Returns the (excluded) end of the range, or Token::Invalid for no recovery.
-using RecoveryStrategy =
- llvm::function_ref<Token::Index(Token::Index Start, const TokenStream &)>;
-
-// Specify a language that can be parsed by the pseduoparser.
-struct Language {
- Grammar G;
- LRTable Table;
-
- // Binding extension ids to corresponding implementations.
- llvm::DenseMap<RuleID, RuleGuard> Guards;
- llvm::DenseMap<ExtensionID, RecoveryStrategy> RecoveryStrategies;
-
- // FIXME: add clang::LangOptions.
- // FIXME: add default start symbols.
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_LANGUAGE_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h b/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
deleted file mode 100644
index db09aba21502f..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/cli/CLI.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//===--- CLI.h - Get grammar from variant sources ----------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Provides the Grammar, LRTable etc for a language specified by the `--grammar`
-// flags. It is by design to be used by pseudoparser-based CLI tools.
-//
-// The CLI library defines a `--grammar` CLI flag, which supports 1) using a
-// grammar from a file (--grammar=/path/to/lang.bnf) or using the prebuilt cxx
-// language (--grammar=cxx).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_CLI_CLI_H
-#define CLANG_PSEUDO_CLI_CLI_H
-
-#include "clang-pseudo/Language.h"
-
-namespace clang {
-namespace pseudo {
-
-// Returns the corresponding Language from the '--grammar' command-line flag.
-//
-// !! If the grammar flag is invalid (e.g. unexisting file), this function will
-// exit the program immediately.
-const Language &getLanguageFromFlags();
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_CLI_CLI_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h b/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
deleted file mode 100644
index 7bbb4d2c00201..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//===--- CXX.h - Public interfaces for the C++ grammar -----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines public interfaces for the C++ grammar
-// (pseudo/lib/cxx/cxx.bnf). It provides a fast way to access core building
-// pieces of the LR parser, e.g. Grammar, LRTable, rather than parsing the
-// grammar file at the runtime.
-//
-// We do a compilation of the C++ BNF grammar at build time, and generate
-// critical data sources. The implementation of the interfaces are based on the
-// generated data sources.
-//
-// FIXME: not everything is fully compiled yet. The implementation of the
-// interfaces are still parsing the grammar file at the runtime.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_CXX_CXX_H
-#define CLANG_PSEUDO_CXX_CXX_H
-
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-
-namespace clang {
-namespace pseudo {
-namespace cxx {
-
-// We want enums to be scoped but implicitly convertible to RuleID etc.
-// So create regular (unscoped) enums inside subnamespaces of `detail`.
-// Then add aliases for them outside `detail`.
-namespace detail {
-namespace symbols {
-enum Symbol : SymbolID {
-#define NONTERMINAL(X, Y) X = Y,
-#include "CXXSymbols.inc"
-#undef NONTERMINAL
-};
-} // namespace symbols
-
-namespace extensions {
-enum Extension : ExtensionID {
-#define EXTENSION(X, Y) X = Y,
-#include "CXXSymbols.inc"
-#undef EXTENSION
-};
-} // namespace extensions
-
-namespace rules {
-// For each symbol we close the last symbol's enum+namespace and open new ones.
-// We need a dummy namespace+enum so that this works for the first rule.
-namespace dummy {
-enum Dummy {
-//clang-format off
-#define NONTERMINAL(NAME, ID) \
-}; \
-} \
-namespace NAME { \
-enum Rule : RuleID {
-//clang-format on
-#define RULE(LHS, RHS, ID) RHS = ID,
-#include "CXXSymbols.inc"
-};
-}
-} // namespace rules
-} // namespace detail
-
-// Symbol represents nonterminal symbols in the C++ grammar.
-// It provides a simple uniform way to access a particular nonterminal.
-using Symbol = detail::symbols::Symbol;
-
-using Extension = detail::extensions::Extension;
-
-namespace rule {
-#define NONTERMINAL(NAME, ID) using NAME = detail::rules::NAME::Rule;
-#include "CXXSymbols.inc"
-} // namespace rule
-
-// Returns the Language for the cxx.bnf grammar.
-const Language &getLanguage();
-
-} // namespace cxx
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_CXX_CXX_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
deleted file mode 100644
index a1c779a02d864..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ /dev/null
@@ -1,230 +0,0 @@
-//===--- Grammar.h - grammar used by clang pseudoparser ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines base structures for parsing & modeling a grammar for a
-// programming language:
-//
-// # This is a fake C++ BNF grammar
-// _ := translation-unit
-// translation-unit := declaration-seq_opt
-// declaration-seq := declaration
-// declaration-seq := declaration-seq declaration
-//
-// A grammar formally describes a language, and it is constructed by a set of
-// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
-// nonterminal or terminal, identified by a SymbolID.
-//
-// Annotations are supported in a syntax form of [key=value]. They specify
-// attributes which are associated with either a grammar symbol (on the
-// right-hand side of the symbol) or a grammar rule (at the end of the rule
-// body).
-// Attributes provide a way to inject custom code into the GLR parser. Each
-// unique attribute value creates an extension point (identified by ExtensionID
-// ), and an extension point corresponds to a piece of native code. For
-// example, C++ grammar has a rule:
-//
-// compound_statement := { statement-seq [recover=Brackets] }
-//
-// The `recover` attribute instructs the parser that we should perform error
-// recovery if parsing the statement-seq fails. The `Brackets` recovery
-// heuristic is implemented in CXX.cpp by binding the ExtensionID for the
-// `Recovery` value to a specific C++ function that finds the recovery point.
-//
-// Notions about the BNF grammar:
-// - "_" is the start symbol of the augmented grammar;
-// - single-line comment is supported, starting with a #
-// - A rule describes how a nonterminal (left side of :=) is constructed, and
-// it is *per line* in the grammar file
-// - Terminals (also called tokens) correspond to the clang::TokenKind; they
-// are written in the grammar like "IDENTIFIER", "USING", "+"
-// - Nonterminals are specified with "lower-case" names in the grammar; they
-// shouldn't be nullable (has an empty sequence)
-// - optional symbols are supported (specified with a _opt suffix), and they
-// will be eliminated during the grammar parsing stage
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
-#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
-
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
-#include <optional>
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
-// nonterminal IDs are indexes into a table of nonterminal symbols.
-// Terminal IDs correspond to the clang TokenKind enum.
-using SymbolID = uint16_t;
-// SymbolID is only 12 bits wide.
-// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
-static constexpr uint16_t SymbolBits = 12;
-static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
-// SymbolIDs with the top bit set are tokens/terminals.
-static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
-inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
-inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
-// The terminals are always the clang tok::TokenKind (not all are used).
-inline tok::TokenKind symbolToToken(SymbolID SID) {
- assert(isToken(SID));
- SID &= ~TokenFlag;
- assert(SID < NumTerminals);
- return static_cast<tok::TokenKind>(SID);
-}
-inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
- return TokenFlag | static_cast<SymbolID>(TK);
-}
-
-// An extension is a piece of native code specific to a grammar that modifies
-// the behavior of annotated rules. One ExtensionID is assigned for each unique
-// attribute value (all attributes share a namespace).
-using ExtensionID = uint16_t;
-
-// A RuleID uniquely identifies a production rule in a grammar.
-// It is an index into a table of rules.
-using RuleID = uint16_t;
-// There are maximum 2^12 rules.
-static constexpr unsigned RuleBits = 12;
-
-// Represent a production rule in the grammar, e.g.
-// expression := a b c
-// ^Target ^Sequence
-struct Rule {
- Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
-
- // We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
- // long, however, we're stricter in order to reduce the size, we limit the max
- // length to 9 (this is the longest sequence in cxx grammar).
- static constexpr unsigned SizeBits = 4;
- static constexpr unsigned MaxElements = 9;
- static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
- static_assert(SizeBits + SymbolBits <= 16,
- "Must be able to store symbol ID + size efficiently");
-
- // 16 bits for target symbol and size of sequence:
- // SymbolID : 12 | Size : 4
- SymbolID Target : SymbolBits;
- uint8_t Size : SizeBits; // Size of the Sequence
- SymbolID Sequence[MaxElements];
-
- // A guarded rule has extra logic to determine whether the RHS is eligible.
- bool Guarded = false;
-
- // Specifies the index within Sequence eligible for error recovery.
- // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
- // should recover by finding the matching brace, and forcing stmt-seq to match
- // everything between braces.
- // For now, only a single strategy at a single point is possible.
- uint8_t RecoveryIndex = -1;
- ExtensionID Recovery = 0;
-
- llvm::ArrayRef<SymbolID> seq() const {
- return llvm::ArrayRef<SymbolID>(Sequence, Size);
- }
- friend bool operator==(const Rule &L, const Rule &R) {
- return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
- }
-};
-
-struct GrammarTable;
-
-// Grammar that describes a programming language, e.g. C++. It represents the
-// contents of the specified grammar.
-// It is a building block for constructing a table-based parser.
-class Grammar {
-public:
- Grammar() = default; // Creates an invalid dummy grammar.
- explicit Grammar(std::unique_ptr<GrammarTable>);
-
- // Parses grammar from a BNF file.
- // Diagnostics emitted during parsing are stored in Diags.
- static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags);
-
- // Returns the SymbolID of the symbol '_'.
- SymbolID underscore() const { return Underscore; };
-
- // Returns all rules of the given nonterminal symbol.
- llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
- const Rule &lookupRule(RuleID RID) const;
-
- // Gets symbol (terminal or nonterminal) name.
- // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
- llvm::StringRef symbolName(SymbolID) const;
-
- // Lookup the SymbolID of the nonterminal symbol by Name.
- std::optional<SymbolID> findNonterminal(llvm::StringRef Name) const;
-
- // Dumps the whole grammar.
- std::string dump() const;
- // Dumps a particular rule.
- std::string dumpRule(RuleID) const;
- // Dumps all rules of the given nonterminal symbol.
- std::string dumpRules(SymbolID) const;
-
- const GrammarTable &table() const { return *T; }
-
-private:
- std::unique_ptr<GrammarTable> T;
- // The symbol ID of '_'. (In the LR literature, this is the start symbol of
- // the augmented grammar.)
- SymbolID Underscore;
-};
-// For each nonterminal X, computes the set of terminals that begin strings
-// derived from X. (Known as FIRST sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
-// For each nonterminal X, computes the set of terminals that could immediately
-// follow X. (Known as FOLLOW sets in grammar-based parsers).
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
-
-// Storage for the underlying data of the Grammar.
-// It can be constructed dynamically (from compiling BNF file) or statically
-// (a compiled data-source).
-struct GrammarTable {
- GrammarTable();
-
- struct Nonterminal {
- std::string Name;
- // Corresponding rules that construct the nonterminal, it is a [Start, End)
- // index range of the Rules table.
- struct {
- RuleID Start;
- RuleID End;
- } RuleRange;
- };
-
- // RuleID is an index into this table of rule definitions.
- //
- // Rules with the same target symbol (LHS) are grouped into a single range.
- // The relative order of different target symbols is *not* by SymbolID, but
- // rather a topological sort: if S := T then the rules producing T have lower
- // RuleIDs than rules producing S.
- // (This strange order simplifies the GLR parser: for a given token range, if
- // we reduce in increasing RuleID order then we need never backtrack --
- // prerequisite reductions are reached before dependent ones).
- std::vector<Rule> Rules;
- // A table of terminals (aka tokens). It corresponds to the clang::Token.
- // clang::tok::TokenKind is the index of the table.
- llvm::ArrayRef<std::string> Terminals;
- // A table of nonterminals, sorted by name.
- // SymbolID is the index of the table.
- std::vector<Nonterminal> Nonterminals;
- // A table of attribute values, sorted by name.
- // ExtensionID is the index of the table.
- std::vector<std::string> AttributeValues;
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
deleted file mode 100644
index dd9e87c2c172b..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
+++ /dev/null
@@ -1,196 +0,0 @@
-//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// LR parsers are bottom-up parsers -- they scan the input from left to right,
-// and collect the right-hand side of a production rule (called handle) on top
-// of the stack, then replace (reduce) the handle with the nonterminal defined
-// by the production rule.
-//
-// This file defines LRGraph, a deterministic handle-finding finite-state
-// automaton, which is a key component in LR parsers to recognize any of
-// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
-// Table) based on the LRGraph.
-//
-// LRGraph can be constructed for any context-free grammars.
-// Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
-// interpretation of the FSA is nondeterministic -- we might in a state where
-// we can continue searching an handle and identify a handle (called
-// shift/reduce conflicts), or identify more than one handle (callled
-// reduce/reduce conflicts).
-//
-// LRGraph is a common model for all variants of LR automatons, from the most
-// basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
-// in making decisions.
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
-#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/Hashing.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-
-// An LR item -- a grammar rule with a dot at some position of the body.
-// e.g. a production rule A := X Y yields 3 items:
-// A := . X Y
-// A := X . Y
-// A := X Y .
-// An item indicates how much of a production rule has been recognized at a
-// position (described by dot), for example, A := X . Y indicates that we have
-// recognized the X part from the input, and we hope next to see the input
-// derivable from Y.
-class Item {
-public:
- static Item start(RuleID ID, const Grammar &G) {
- Item I;
- I.RID = ID;
- I.RuleLength = G.lookupRule(ID).Size;
- return I;
- }
- static Item sentinel(RuleID ID) {
- Item I;
- I.RID = ID;
- return I;
- }
-
- RuleID rule() const { return RID; }
- uint8_t dot() const { return DotPos; }
-
- bool hasNext() const { return DotPos < RuleLength; }
- SymbolID next(const Grammar &G) const {
- assert(hasNext());
- return G.lookupRule(RID).Sequence[DotPos];
- }
-
- Item advance() const {
- assert(hasNext());
- Item I = *this;
- ++I.DotPos;
- return I;
- }
-
- std::string dump(const Grammar &G) const;
-
- bool operator==(const Item &I) const {
- return DotPos == I.DotPos && RID == I.RID;
- }
- bool operator<(const Item &I) const {
- return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
- }
- friend llvm::hash_code hash_value(const Item &I) {
- return llvm::hash_combine(I.RID, I.DotPos);
- }
-
-private:
- RuleID RID = 0;
- uint8_t DotPos = 0;
- uint8_t RuleLength = 0; // the length of rule body.
-};
-
-// A state represents a node in the LR automaton graph. It is an item set, which
-// contains all possible rules that the LR parser may be parsing in that state.
-//
-// Conceptually, If we knew in advance what we're parsing, at any point we're
-// partway through parsing a production, sitting on a stack of partially parsed
-// productions. But because we don't know, there could be *several* productions
-// we're partway through. The set of possibilities is the parser state, and we
-// precompute all the transitions between these states.
-struct State {
- // A full set of items (including non-kernel items) representing the state,
- // in a canonical order (see SortByNextSymbol in the cpp file).
- std::vector<Item> Items;
-
- std::string dump(const Grammar &G, unsigned Indent = 0) const;
-};
-
-// LRGraph is a deterministic finite state automaton for LR parsing.
-//
-// Intuitively, an LR automaton is a transition graph. The graph has a
-// collection of nodes, called States. Each state corresponds to a particular
-// item set, which represents a condition that could occur during the process of
-// parsing a production. Edges are directed from one state to another. Each edge
-// is labeled by a grammar symbol (terminal or nonterminal).
-//
-// LRGraph is used to construct the LR parsing table which is a core
-// data-structure driving the LR parser.
-class LRGraph {
-public:
- // StateID is the index in States table.
- using StateID = uint16_t;
-
- // Constructs an LR(0) automaton.
- static LRGraph buildLR0(const Grammar &);
-
- // An edge in the LR graph, it represents a transition in the LR automaton.
- // If the parser is at state Src, with a lookahead Label, then it
- // transits to state Dst.
- struct Edge {
- StateID Src, Dst;
- SymbolID Label;
- };
-
- // A possible error recovery: choose to match some tokens against a symbol.
- //
- // e.g. a state that contains
- // stmt := { . stmt-seq [recover=braces] }
- // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
- struct Recovery {
- StateID Src; // The state we are in when encountering the error.
- ExtensionID Strategy; // Heuristic choosing the tokens to match.
- SymbolID Result; // The symbol that is produced.
- };
-
- llvm::ArrayRef<State> states() const { return States; }
- llvm::ArrayRef<Edge> edges() const { return Edges; }
- llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
- llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
- return StartStates;
- }
-
- std::string dumpForTests(const Grammar &) const;
-
-private:
- LRGraph(std::vector<State> States, std::vector<Edge> Edges,
- std::vector<Recovery> Recoveries,
- std::vector<std::pair<SymbolID, StateID>> StartStates)
- : States(std::move(States)), Edges(std::move(Edges)),
- Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
- }
-
- std::vector<State> States;
- std::vector<Edge> Edges;
- std::vector<Recovery> Recoveries;
- std::vector<std::pair<SymbolID, StateID>> StartStates;
-};
-
-} // namespace pseudo
-} // namespace clang
-
-namespace llvm {
-// Support clang::pseudo::Item as DenseMap keys.
-template <> struct DenseMapInfo<clang::pseudo::Item> {
- static inline clang::pseudo::Item getEmptyKey() {
- return clang::pseudo::Item::sentinel(-1);
- }
- static inline clang::pseudo::Item getTombstoneKey() {
- return clang::pseudo::Item::sentinel(-2);
- }
- static unsigned getHashValue(const clang::pseudo::Item &I) {
- return hash_value(I);
- }
- static bool isEqual(const clang::pseudo::Item &LHS,
- const clang::pseudo::Item &RHS) {
- return LHS == RHS;
- }
-};
-} // namespace llvm
-
-#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
deleted file mode 100644
index 1706b6936c9ea..0000000000000
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ /dev/null
@@ -1,278 +0,0 @@
-//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LRTable (referred as LR parsing table in the LR literature) is the core
-// component in LR parsers, it drives the LR parsers by specifying an action to
-// take given the current state on the top of the stack and the current
-// lookahead token.
-//
-// The LRTable can be described as a matrix where the rows represent
-// the states of the LR graph, the columns represent the symbols of the
-// grammar, and each entry of the matrix (called action) represents a
-// state transition in the graph.
-//
-// Typically, based on the category of the grammar symbol, the LRTable is
-// broken into two logically separate tables:
-// - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
-// next action (shift/reduce) on state S under a lookahead terminal a
-// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
-// the state which we transist to from the state S with the nonterminal X
-//
-// LRTable is *performance-critial* as it is consulted frequently during a
-// parse. In general, LRTable is very sparse (most of the entries are empty).
-// For example, for the C++ language, the SLR table has ~1500 states and 650
-// symbols which results in a matrix having 975K entries, ~90% of entries are
-// empty.
-//
-// This file implements a speed-and-space-efficient LRTable.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
-#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Support/Capacity.h"
-#include "llvm/Support/MathExtras.h"
-#include <cstdint>
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-
-// Represents the LR parsing table, which can efficiently the question "what is
-// the next step given the lookahead token and current state on top of the
-// stack?".
-//
-// This is a dense implementation, which only takes an amount of space that is
-// proportional to the number of non-empty entries in the table.
-//
-// Unlike the typical LR parsing table which allows at most one available action
-// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
-// to be used in nondeterministic LR parsers (e.g. GLR).
-//
-// There are no "accept" actions in the LRTable, instead the stack is inspected
-// after parsing completes: is the state goto(StartState, StartSymbol)?
-class LRTable {
-public:
- // StateID is only 13 bits wide.
- using StateID = uint16_t;
- static constexpr unsigned StateBits = 13;
-
- struct Recovery {
- ExtensionID Strategy;
- SymbolID Result;
- };
-
- // Returns the state after we reduce a nonterminal.
- // Expected to be called by LR parsers.
- // If the nonterminal is invalid here, returns std::nullopt.
- std::optional<StateID> getGoToState(StateID State,
- SymbolID Nonterminal) const {
- return Gotos.get(gotoIndex(State, Nonterminal, numStates()));
- }
- // Returns the state after we shift a terminal.
- // Expected to be called by LR parsers.
- // If the terminal is invalid here, returns std::nullopt.
- std::optional<StateID> getShiftState(StateID State,
- SymbolID Terminal) const {
- return Shifts.get(shiftIndex(State, Terminal, numStates()));
- }
-
- // Returns the possible reductions from a state.
- //
- // These are not keyed by a lookahead token. Instead, call canFollow() to
- // check whether a reduction should apply in the current context:
- // for (RuleID R : LR.getReduceRules(S)) {
- // if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
- // continue;
- // // ...apply reduce...
- // }
- llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
- assert(State + 1u < ReduceOffset.size());
- return llvm::ArrayRef(Reduces.data() + ReduceOffset[State],
- Reduces.data() + ReduceOffset[State + 1]);
- }
- // Returns whether Terminal can follow Nonterminal in a valid source file.
- bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
- assert(isToken(Terminal));
- assert(isNonterminal(Nonterminal));
- // tok::unknown is a sentinel value used in recovery: can follow anything.
- return Terminal == tokenSymbol(tok::unknown) ||
- FollowSets.test(tok::NUM_TOKENS * Nonterminal +
- symbolToToken(Terminal));
- }
-
- // Looks up available recovery actions if we stopped parsing in this state.
- llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
- return llvm::ArrayRef(Recoveries.data() + RecoveryOffset[State],
- Recoveries.data() + RecoveryOffset[State + 1]);
- }
-
- // Returns the state from which the LR parser should start to parse the input
- // tokens as the given StartSymbol.
- //
- // In LR parsing, the start state of `translation-unit` corresponds to
- // `_ := • translation-unit`.
- //
- // Each start state responds to **a** single grammar rule like `_ := start`.
- // REQUIRE: The given StartSymbol must exist in the grammar (in a form of
- // `_ := start`).
- StateID getStartState(SymbolID StartSymbol) const;
-
- size_t bytes() const {
- return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
- llvm::capacity_in_bytes(Reduces) +
- llvm::capacity_in_bytes(ReduceOffset) +
- llvm::capacity_in_bytes(FollowSets);
- }
-
- std::string dumpStatistics() const;
- std::string dumpForTests(const Grammar &G) const;
-
- // Build a SLR(1) parsing table.
- static LRTable buildSLR(const Grammar &G);
-
- // Helper for building a table with specified actions/states.
- struct Builder {
- Builder() = default;
- Builder(const Grammar &G) {
- NumNonterminals = G.table().Nonterminals.size();
- FollowSets = followSets(G);
- }
-
- unsigned int NumNonterminals = 0;
- // States representing `_ := . start` for various start symbols.
- std::vector<std::pair<SymbolID, StateID>> StartStates;
- // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
- // Key is (initial state, D), value is final state.
- llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
- // Reductions available in a given state.
- llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
- // FollowSets[NT] is the set of terminals that can follow the nonterminal.
- std::vector<llvm::DenseSet<SymbolID>> FollowSets;
- // Recovery options available at each state.
- std::vector<std::pair<StateID, Recovery>> Recoveries;
-
- LRTable build() &&;
- };
-
-private:
- unsigned numStates() const { return ReduceOffset.size() - 1; }
-
- // A map from unsigned key => StateID, used to store actions.
- // The keys should be sequential but the values are somewhat sparse.
- //
- // In practice, the keys encode (origin state, symbol) pairs, and the values
- // are the state we should move to after seeing that symbol.
- //
- // We store one bit for presence/absence of the value for each key.
- // At every 64th key, we store the offset into the table of values.
- // e.g. key 0x500 is checkpoint 0x500/64 = 20
- // Checkpoints[20] = 34
- // get(0x500) = Values[34] (assuming it has a value)
- // To look up values in between, we count the set bits:
- // get(0x509) has a value if HasValue[20] & (1<<9)
- // #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
- // get(0x509) = Values[34 + popcnt(...)]
- //
- // Overall size is 1.25 bits/key + 16 bits/value.
- // Lookup is constant time with a low factor (no hashing).
- class TransitionTable {
- using Word = uint64_t;
- constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);
-
- std::vector<StateID> Values;
- std::vector<Word> HasValue;
- std::vector<uint16_t> Checkpoints;
-
- public:
- TransitionTable() = default;
- TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
- unsigned NumKeys) {
- assert(
- Entries.size() <
- std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
- "16 bits too small for value offsets!");
- unsigned NumWords = (NumKeys + WordBits - 1) / WordBits;
- HasValue.resize(NumWords, 0);
- Checkpoints.reserve(NumWords);
- Values.reserve(Entries.size());
- for (unsigned I = 0; I < NumKeys; ++I) {
- if ((I % WordBits) == 0)
- Checkpoints.push_back(Values.size());
- auto It = Entries.find(I);
- if (It != Entries.end()) {
- HasValue[I / WordBits] |= (Word(1) << (I % WordBits));
- Values.push_back(It->second);
- }
- }
- }
-
- std::optional<StateID> get(unsigned Key) const {
- // Do we have a value for this key?
- Word KeyMask = Word(1) << (Key % WordBits);
- unsigned KeyWord = Key / WordBits;
- if ((HasValue[KeyWord] & KeyMask) == 0)
- return std::nullopt;
- // Count the number of values since the checkpoint.
- Word BelowKeyMask = KeyMask - 1;
- unsigned CountSinceCheckpoint =
- llvm::popcount(HasValue[KeyWord] & BelowKeyMask);
- // Find the value relative to the last checkpoint.
- return Values[Checkpoints[KeyWord] + CountSinceCheckpoint];
- }
-
- unsigned size() const { return Values.size(); }
-
- size_t bytes() const {
- return llvm::capacity_in_bytes(HasValue) +
- llvm::capacity_in_bytes(Values) +
- llvm::capacity_in_bytes(Checkpoints);
- }
- };
- // Shift and Goto tables are keyed by encoded (State, Symbol).
- static unsigned shiftIndex(StateID State, SymbolID Terminal,
- unsigned NumStates) {
- return NumStates * symbolToToken(Terminal) + State;
- }
- static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
- unsigned NumStates) {
- assert(isNonterminal(Nonterminal));
- return NumStates * Nonterminal + State;
- }
- TransitionTable Shifts;
- TransitionTable Gotos;
-
- // A sorted table, storing the start state for each target parsing symbol.
- std::vector<std::pair<SymbolID, StateID>> StartStates;
-
- // Given a state ID S, the half-open range of Reduces is
- // [ReduceOffset[S], ReduceOffset[S+1])
- std::vector<uint32_t> ReduceOffset;
- std::vector<RuleID> Reduces;
- // Conceptually this is a bool[SymbolID][Token], each entry describing whether
- // the grammar allows the (nonterminal) symbol to be followed by the token.
- //
- // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
- // as an index: Nonterminal * NUM_TOKENS + Token.
- llvm::BitVector FollowSets;
-
- // Recovery stores all recovery actions from all states.
- // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
- std::vector<uint32_t> RecoveryOffset;
- std::vector<Recovery> Recoveries;
-};
-
-} // namespace pseudo
-} // namespace clang
-
-#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
index f92f79be12150..0f56728d0eceb 100644
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -1,22 +1,14 @@
-add_subdirectory(cli)
-add_subdirectory(cxx)
-add_subdirectory(grammar)
-
set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangPseudo
Bracket.cpp
DirectiveTree.cpp
- Disambiguate.cpp
- Forest.cpp
- GLR.cpp
Lex.cpp
Token.cpp
LINK_LIBS
clangBasic
clangLex
- clangPseudoGrammar
DEPENDS
ClangDriverOptions
diff --git a/clang-tools-extra/pseudo/lib/Disambiguate.cpp b/clang-tools-extra/pseudo/lib/Disambiguate.cpp
deleted file mode 100644
index b0bc75cf96c93..0000000000000
--- a/clang-tools-extra/pseudo/lib/Disambiguate.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===--- Disambiguate.cpp - Find the best tree in the forest --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Disambiguate.h"
-
-namespace clang::pseudo {
-
-Disambiguation disambiguate(const ForestNode *Root,
- const DisambiguateParams &Params) {
- // FIXME: this is a dummy placeholder strategy, implement a real one!
- Disambiguation Result;
- for (const ForestNode &N : Root->descendants()) {
- if (N.kind() == ForestNode::Ambiguous)
- Result.try_emplace(&N, 1);
- }
- return Result;
-}
-
-void removeAmbiguities(ForestNode *&Root, const Disambiguation &D) {
- std::vector<ForestNode **> Queue = {&Root};
- while (!Queue.empty()) {
- ForestNode **Next = Queue.back();
- Queue.pop_back();
- switch ((*Next)->kind()) {
- case ForestNode::Sequence:
- for (ForestNode *&Child : (*Next)->elements())
- Queue.push_back(&Child);
- break;
- case ForestNode::Ambiguous: {
- assert(D.count(*Next) != 0 && "disambiguation is incomplete!");
- ForestNode *ChosenChild = (*Next)->alternatives()[D.lookup(*Next)];
- *Next = ChosenChild;
- Queue.push_back(Next);
- break;
- }
- case ForestNode::Terminal:
- case ForestNode::Opaque:
- break;
- }
- }
-}
-
-} // namespace clang::pseudo
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
deleted file mode 100644
index e8e60e5ec475a..0000000000000
--- a/clang-tools-extra/pseudo/lib/Forest.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Token.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <optional>
-
-namespace clang {
-namespace pseudo {
-
-void ForestNode::RecursiveIterator::operator++() {
- auto C = Cur->children();
- // Try to find a child of the current node to descend into.
- for (unsigned I = 0; I < C.size(); ++I) {
- if (Seen.insert(C[I]).second) {
- Stack.push_back({Cur, I});
- Cur = C[I];
- return;
- }
- }
- // Try to find a sibling af an ancestor to advance to.
- for (; !Stack.empty(); Stack.pop_back()) {
- C = Stack.back().Parent->children();
- unsigned &Index = Stack.back().ChildIndex;
- while (++Index < C.size()) {
- if (Seen.insert(C[Index]).second) {
- Cur = C[Index];
- return;
- }
- }
- }
- Cur = nullptr;
-}
-
-llvm::iterator_range<ForestNode::RecursiveIterator>
-ForestNode::descendants() const {
- return {RecursiveIterator(this), RecursiveIterator()};
-}
-
-std::string ForestNode::dump(const Grammar &G) const {
- switch (kind()) {
- case Ambiguous:
- return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
- case Terminal:
- return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
- startTokenIndex());
- case Sequence:
- return G.dumpRule(rule());
- case Opaque:
- return llvm::formatv("{0} := <opaque>", G.symbolName(symbol()));
- }
- llvm_unreachable("Unhandled node kind!");
-}
-
-std::string ForestNode::dumpRecursive(const Grammar &G,
- bool Abbreviated) const {
- using llvm::formatv;
- Token::Index MaxToken = 0;
- // Count visits of nodes so we can mark those seen multiple times.
- llvm::DenseMap<const ForestNode *, /*VisitCount*/ unsigned> VisitCounts;
- std::function<void(const ForestNode *)> CountVisits =
- [&](const ForestNode *P) {
- MaxToken = std::max(MaxToken, P->startTokenIndex());
- if (VisitCounts[P]++ > 0)
- return; // Don't count children as multiply visited.
- if (P->kind() == Ambiguous)
- llvm::for_each(P->alternatives(), CountVisits);
- else if (P->kind() == Sequence)
- llvm::for_each(P->elements(), CountVisits);
- };
- CountVisits(this);
-
- unsigned IndexWidth = std::max(3, (int)std::to_string(MaxToken).size());
- // e.g. "[{0,4}, {1,4})" if MaxToken is 5742.
- std::string RangeFormat = formatv("[{{0,{0}}, {{1,{0}}) ", IndexWidth);
-
- // The box-drawing characters that should be added as a child is rendered.
- struct LineDecoration {
- std::string Prefix; // Prepended to every line.
- llvm::StringRef First; // added to the child's line.
- llvm::StringRef Subsequent; // added to descendants' lines.
- };
-
- // We print a "#<id>" for nonterminal forest nodes that are being dumped
- // multiple times.
- llvm::DenseMap<const ForestNode *, size_t> ReferenceIds;
- std::string Result;
- constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
- std::function<void(const ForestNode *, Token::Index, std::optional<SymbolID>,
- LineDecoration &LineDec)>
- Dump = [&](const ForestNode *P, Token::Index End,
- std::optional<SymbolID> ElidedParent, LineDecoration LineDec) {
- bool SharedNode = VisitCounts.find(P)->getSecond() > 1;
- llvm::ArrayRef<const ForestNode *> Children;
- auto EndOfElement = [&](size_t ChildIndex) {
- return ChildIndex + 1 == Children.size()
- ? End
- : Children[ChildIndex + 1]->startTokenIndex();
- };
- if (P->kind() == Ambiguous) {
- Children = P->alternatives();
- } else if (P->kind() == Sequence) {
- Children = P->elements();
- if (Abbreviated) {
- // Abbreviate chains of trivial sequence nodes.
- // A := B, B := C, C := D, D := X Y Z
- // becomes
- // A~D := X Y Z
- //
- // We can't hide nodes that appear multiple times in the tree,
- // because we need to call out their identity with IDs.
- if (Children.size() == 1 && !SharedNode) {
- assert(Children[0]->startTokenIndex() == P->startTokenIndex() &&
- EndOfElement(0) == End);
- return Dump(Children[0], End,
- /*ElidedParent=*/ElidedParent.value_or(P->symbol()),
- LineDec);
- }
- }
- }
-
- if (End == KEnd)
- Result += formatv(RangeFormat.c_str(), P->startTokenIndex(), "end");
- else
- Result += formatv(RangeFormat.c_str(), P->startTokenIndex(), End);
- Result += LineDec.Prefix;
- Result += LineDec.First;
- if (ElidedParent) {
- Result += G.symbolName(*ElidedParent);
- Result += "~";
- }
-
- if (SharedNode && P->kind() != ForestNode::Terminal) {
- auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1);
- bool First = It.second;
- unsigned ID = It.first->second;
-
- // The first time, print as #1. Later, =#1.
- if (First) {
- Result += formatv("{0} #{1}", P->dump(G), ID);
- } else {
- Result += formatv("{0} =#{1}", G.symbolName(P->symbol()), ID);
- Children = {}; // Don't walk the children again.
- }
- } else {
- Result.append(P->dump(G));
- }
- Result.push_back('\n');
-
- auto OldPrefixSize = LineDec.Prefix.size();
- LineDec.Prefix += LineDec.Subsequent;
- for (size_t I = 0; I < Children.size(); ++I) {
- if (I == Children.size() - 1) {
- LineDec.First = "└─";
- LineDec.Subsequent = " ";
- } else {
- LineDec.First = "├─";
- LineDec.Subsequent = "│ ";
- }
- Dump(Children[I], P->kind() == Sequence ? EndOfElement(I) : End,
- std::nullopt, LineDec);
- }
- LineDec.Prefix.resize(OldPrefixSize);
- };
- LineDecoration LineDec;
- Dump(this, KEnd, std::nullopt, LineDec);
- return Result;
-}
-
-llvm::ArrayRef<ForestNode>
-ForestArena::createTerminals(const TokenStream &Code) {
- ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size() + 1);
- size_t Index = 0;
- for (const auto &T : Code.tokens()) {
- new (&Terminals[Index])
- ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
- /*Start=*/Index, /*TerminalData*/ 0);
- ++Index;
- }
- // Include an `eof` terminal.
- // This is important to drive the final shift/recover/reduce loop.
- new (&Terminals[Index])
- ForestNode(ForestNode::Terminal, tokenSymbol(tok::eof),
- /*Start=*/Index, /*TerminalData*/ 0);
- ++Index;
- NodeCount = Index;
- return llvm::ArrayRef(Terminals, Index);
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
deleted file mode 100644
index ac43c02db521e..0000000000000
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ /dev/null
@@ -1,772 +0,0 @@
-//===--- GLR.cpp -----------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <algorithm>
-#include <memory>
-#include <optional>
-#include <queue>
-
-#define DEBUG_TYPE "GLR.cpp"
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-Token::Index findRecoveryEndpoint(ExtensionID Strategy, Token::Index Begin,
- const TokenStream &Tokens,
- const Language &Lang) {
- assert(Strategy != 0);
- if (auto S = Lang.RecoveryStrategies.lookup(Strategy))
- return S(Begin, Tokens);
- return Token::Invalid;
-}
-
-} // namespace
-
-void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
- unsigned &TokenIndex, const ParseParams &Params,
- const Language &Lang,
- std::vector<const GSS::Node *> &NewHeads) {
- LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
- // Describes a possibility to recover by forcibly interpreting a range of
- // tokens around the cursor as a nonterminal that we expected to see.
- struct PlaceholderRecovery {
- // The token prior to the nonterminal which is being recovered.
- // This starts of the region we're skipping, so higher Position is better.
- Token::Index Position;
- // The nonterminal which will be created in order to recover.
- SymbolID Symbol;
- // The heuristic used to choose the bounds of the nonterminal to recover.
- ExtensionID Strategy;
-
- // The GSS head where we are expecting the recovered nonterminal.
- const GSS::Node *RecoveryNode;
- // Payload of nodes on the way back from the OldHead to the recovery node.
- // These represent the partial parse that is being discarded.
- // They should become the children of the opaque recovery node.
- // FIXME: internal structure of opaque nodes is not implemented.
- //
- // There may be multiple paths leading to the same recovery node, we choose
- // one arbitrarily.
- std::vector<const ForestNode *> DiscardedParse;
- };
- std::vector<PlaceholderRecovery> Options;
-
- // Find recovery options by walking up the stack.
- //
- // This is similar to exception handling: we walk up the "frames" of nested
- // rules being parsed until we find one that has a "handler" which allows us
- // to determine the node bounds without parsing it.
- //
- // Unfortunately there's a significant difference: the stack contains both
- // "upward" nodes (ancestor parses) and "leftward" ones.
- // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains:
- // stmt := IF ( expr ) . stmt - current state, we should recover here!
- // stmt := IF ( expr . ) stmt - (left, no recovery here)
- // stmt := IF ( . expr ) stmt - left, we should NOT recover here!
- // stmt := IF . ( expr ) stmt - (left, no recovery here)
- // stmt-seq := . stmt - up, we might recover here
- // compound-stmt := { . stmt-seq } - up, we should recover here!
- //
- // It's not obvious how to avoid collecting "leftward" recovery options.
- // I think the distinction is ill-defined after merging items into states.
- // For now, we have to take this into account when defining recovery rules.
- // (e.g. in the expr recovery above, stay inside the parentheses).
- // FIXME: find a more satisfying way to avoid such false recovery.
- // FIXME: Add a test for spurious recovery once tests can define strategies.
- std::vector<const ForestNode *> Path;
- llvm::DenseSet<const GSS::Node *> Seen;
- auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
- if (!Seen.insert(N).second)
- return;
- if (!N->Recovered) { // Don't recover the same way twice!
- for (auto Strategy : Lang.Table.getRecovery(N->State)) {
- Options.push_back(PlaceholderRecovery{
- NextTok,
- Strategy.Result,
- Strategy.Strategy,
- N,
- Path,
- });
- LLVM_DEBUG(llvm::dbgs()
- << "Option: recover " << Lang.G.symbolName(Strategy.Result)
- << " at token " << NextTok << "\n");
- }
- }
- Path.push_back(N->Payload);
- for (const GSS::Node *Parent : N->parents())
- WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp);
- Path.pop_back();
- };
- for (auto *N : OldHeads)
- WalkUp(N, TokenIndex, WalkUp);
-
- // Now we select the option(s) we will use to recover.
- //
- // We prefer options starting further right, as these discard less code
- // (e.g. we prefer to recover inner scopes rather than outer ones).
- // The options also need to agree on an endpoint, so the parser has a
- // consistent position afterwards.
- //
- // So conceptually we're sorting by the tuple (start, end), though we avoid
- // computing `end` for options that can't be winners.
-
- // Consider options starting further right first.
- // Don't drop the others yet though, we may still use them if preferred fails.
- llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
- return L.Position > R.Position;
- });
-
- // We may find multiple winners, but they will have the same range.
- std::optional<Token::Range> RecoveryRange;
- std::vector<const PlaceholderRecovery *> BestOptions;
- for (const PlaceholderRecovery &Option : Options) {
- // If this starts further left than options we've already found, then
- // we'll never find anything better. Skip computing End for the rest.
- if (RecoveryRange && Option.Position < RecoveryRange->Begin)
- break;
-
- auto End = findRecoveryEndpoint(Option.Strategy, Option.Position,
- Params.Code, Lang);
- // Recovery may not take the parse backwards.
- if (End == Token::Invalid || End < TokenIndex)
- continue;
- if (RecoveryRange) {
- // If this is worse than our previous options, ignore it.
- if (RecoveryRange->End < End)
- continue;
- // If this is an improvement over our previous options, then drop them.
- if (RecoveryRange->End > End)
- BestOptions.clear();
- }
- // Create recovery nodes and heads for them in the GSS. These may be
- // discarded if a better recovery is later found, but this path isn't hot.
- RecoveryRange = {Option.Position, End};
- BestOptions.push_back(&Option);
- }
-
- if (BestOptions.empty()) {
- LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
- << " strategies\n");
- return;
- }
-
- // We've settled on a set of recovery options, so create their nodes and
- // advance the cursor.
- LLVM_DEBUG({
- llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
- for (const auto *Option : BestOptions)
- llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol);
- llvm::dbgs() << "\n";
- });
- // FIXME: in general, we might have the same Option->Symbol multiple times,
- // and we risk creating redundant Forest and GSS nodes.
- // We also may inadvertently set up the next glrReduce to create a sequence
- // node duplicating an opaque node that we're creating here.
- // There are various options, including simply breaking ties between options.
- // For now it's obscure enough to ignore.
- for (const PlaceholderRecovery *Option : BestOptions) {
- Option->RecoveryNode->Recovered = true;
- const ForestNode &Placeholder =
- Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
- LRTable::StateID OldState = Option->RecoveryNode->State;
- LRTable::StateID NewState =
- isToken(Option->Symbol)
- ? *Lang.Table.getShiftState(OldState, Option->Symbol)
- : *Lang.Table.getGoToState(OldState, Option->Symbol);
- const GSS::Node *NewHead =
- Params.GSStack.addNode(NewState, &Placeholder, {Option->RecoveryNode});
- NewHeads.push_back(NewHead);
- }
- TokenIndex = RecoveryRange->End;
-}
-
-using StateID = LRTable::StateID;
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
- std::vector<std::string> ParentStates;
- for (const auto *Parent : N.parents())
- ParentStates.push_back(llvm::formatv("{0}", Parent->State));
- OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
- N.Payload ? N.Payload->symbol() : 0,
- llvm::join(ParentStates, ", "));
- return OS;
-}
-
-// Apply all pending shift actions.
-// In theory, LR parsing doesn't have shift/shift conflicts on a single head.
-// But we may have multiple active heads, and each head has a shift action.
-//
-// We merge the stack -- if multiple heads will reach the same state after
-// shifting a token, we shift only once by combining these heads.
-//
-// E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4:
-// 0---1---2
-// └---3
-// After the shift action, the GSS is:
-// 0---1---2---4
-// └---3---┘
-void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
- const ForestNode &NewTok, const ParseParams &Params,
- const Language &Lang, std::vector<const GSS::Node *> &NewHeads) {
- assert(NewTok.kind() == ForestNode::Terminal);
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" Shift {0} ({1} active heads):\n",
- Lang.G.symbolName(NewTok.symbol()),
- OldHeads.size()));
-
- // We group pending shifts by their target state so we can merge them.
- llvm::SmallVector<std::pair<StateID, const GSS::Node *>, 8> Shifts;
- for (const auto *H : OldHeads)
- if (auto S = Lang.Table.getShiftState(H->State, NewTok.symbol()))
- Shifts.push_back({*S, H});
- llvm::stable_sort(Shifts, llvm::less_first{});
-
- auto Rest = llvm::ArrayRef(Shifts);
- llvm::SmallVector<const GSS::Node *> Parents;
- while (!Rest.empty()) {
- // Collect the batch of PendingShift that have compatible shift states.
- // Their heads become TempParents, the parents of the new GSS node.
- StateID NextState = Rest.front().first;
-
- Parents.clear();
- for (const auto &Base : Rest) {
- if (Base.first != NextState)
- break;
- Parents.push_back(Base.second);
- }
- Rest = Rest.drop_front(Parents.size());
-
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv(" --> S{0} ({1} heads)\n",
- NextState, Parents.size()));
- NewHeads.push_back(Params.GSStack.addNode(NextState, &NewTok, Parents));
- }
-}
-
-namespace {
-// A KeyedQueue yields pairs of keys and values in order of the keys.
-template <typename Key, typename Value>
-using KeyedQueue =
- std::priority_queue<std::pair<Key, Value>,
- std::vector<std::pair<Key, Value>>, llvm::less_first>;
-
-template <typename T> void sortAndUnique(std::vector<T> &Vec) {
- llvm::sort(Vec);
- Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
-}
-
-// Perform reduces until no more are possible.
-//
-// Generally this means walking up from the heads gathering ForestNodes that
-// will match the RHS of the rule we're reducing into a sequence ForestNode,
-// and ending up at a base node.
-// Then we push a new GSS node onto that base, taking care to:
-// - pack alternative sequence ForestNodes into an ambiguous ForestNode.
-// - use the same GSS node for multiple heads if the parse state matches.
-//
-// Examples of reduction:
-// Before (simple):
-// 0--1(expr)--2(semi)
-// After reducing 2 by `stmt := expr semi`:
-// 0--3(stmt) // 3 is goto(0, stmt)
-//
-// Before (splitting due to R/R conflict):
-// 0--1(IDENTIFIER)
-// After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
-// 0--2(class-name) // 2 is goto(0, class-name)
-// └--3(enum-name) // 3 is goto(0, enum-name)
-//
-// Before (splitting due to multiple bases):
-// 0--2(class-name)--4(STAR)
-// └--3(enum-name)---┘
-// After reducing 4 by `ptr-operator := STAR`:
-// 0--2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
-// └--3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
-//
-// Before (joining due to same goto state, multiple bases):
-// 0--1(cv-qualifier)--3(class-name)
-// └--2(cv-qualifier)--4(enum-name)
-// After reducing 3 by `type-name := class-name` and
-// 4 by `type-name := enum-name`:
-// 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
-// └--2(cv-qualifier)--┘ // goto(2, type-name)
-//
-// Before (joining due to same goto state, the same base):
-// 0--1(class-name)--3(STAR)
-// └--2(enum-name)--4(STAR)
-// After reducing 3 by `pointer := class-name STAR` and
-// 2 by`enum-name := class-name STAR`:
-// 0--5(pointer) // 5 is goto(0, pointer)
-//
-// (This is a functor rather than a function to allow it to reuse scratch
-// storage across calls).
-class GLRReduce {
- const ParseParams &Params;
- const Language& Lang;
- // There are two interacting complications:
- // 1. Performing one reduce can unlock new reduces on the newly-created head.
- // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
- // This means we must have unlocked all the reduces that contribute to it.
- // 2b. Similarly, the new GSS nodes must be complete (have all parents).
- //
- // We define a "family" of reduces as those that produce the same symbol and
- // cover the same range of tokens. These are exactly the set of reductions
- // whose sequence nodes would be covered by the same ambiguous node.
- // We wish to process a whole family at a time (to satisfy complication 2),
- // and can address complication 1 by carefully ordering the families:
- // - Process families covering fewer tokens first.
- // A reduce can't depend on a longer reduce!
- // - For equal token ranges: if S := T, process T families before S families.
- // Parsing T can't depend on an equal-length S, as the grammar is acyclic.
- //
- // This isn't quite enough: we don't know the token length of the reduction
- // until we walk up the stack to perform the pop.
- // So we perform the pop part upfront, and place the push specification on
- // priority queues such that we can retrieve a family at a time.
-
- // A reduction family is characterized by its token range and symbol produced.
- // It is used as a key in the priority queues to group pushes by family.
- struct Family {
- // The start of the token range of the reduce.
- Token::Index Start;
- SymbolID Symbol;
- // Rule must produce Symbol and can otherwise be arbitrary.
- // RuleIDs have the topological order based on the acyclic grammar.
- // FIXME: should SymbolIDs be so ordered instead?
- RuleID Rule;
-
- bool operator==(const Family &Other) const {
- return Start == Other.Start && Symbol == Other.Symbol;
- }
- // The larger Family is the one that should be processed first.
- bool operator<(const Family &Other) const {
- if (Start != Other.Start)
- return Start < Other.Start;
- if (Symbol != Other.Symbol)
- return Rule > Other.Rule;
- assert(*this == Other);
- return false;
- }
- };
-
- // A sequence is the ForestNode payloads of the GSS nodes we are reducing.
- using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>;
- // Like ArrayRef<const ForestNode*>, but with the missing operator<.
- // (Sequences are big to move by value as the collections gets rearranged).
- struct SequenceRef {
- SequenceRef(const Sequence &S) : S(S) {}
- llvm::ArrayRef<const ForestNode *> S;
- friend bool operator==(SequenceRef A, SequenceRef B) { return A.S == B.S; }
- friend bool operator<(const SequenceRef &A, const SequenceRef &B) {
- return std::lexicographical_compare(A.S.begin(), A.S.end(), B.S.begin(),
- B.S.end());
- }
- };
- // Underlying storage for sequences pointed to by stored SequenceRefs.
- std::deque<Sequence> SequenceStorage;
- // We don't actually destroy the sequences between calls, to reuse storage.
- // Everything SequenceStorage[ >=SequenceStorageCount ] is reusable scratch.
- unsigned SequenceStorageCount;
-
- // Halfway through a reduction (after the pop, before the push), we have
- // collected nodes for the RHS of a rule, and reached a base node.
- // They specify a sequence ForestNode we may build (but we dedup first).
- // (The RuleID is not stored here, but rather in the Family).
- struct PushSpec {
- // The last node popped before pushing. Its parent is the reduction base(s).
- // (Base is more fundamental, but this is cheaper to store).
- const GSS::Node* LastPop = nullptr;
- Sequence *Seq = nullptr;
- };
- KeyedQueue<Family, PushSpec> Sequences; // FIXME: rename => PendingPushes?
-
- // We treat Heads as a queue of Pop operations still to be performed.
- // PoppedHeads is our position within it.
- std::vector<const GSS::Node *> *Heads;
- unsigned NextPopHead;
- SymbolID Lookahead;
-
- Sequence TempSequence;
-public:
- GLRReduce(const ParseParams &Params, const Language &Lang)
- : Params(Params), Lang(Lang) {}
-
- // Reduce Heads, resulting in new nodes that are appended to Heads.
- // The "consumed" nodes are not removed!
- // Only reduce rules compatible with the Lookahead are applied, though
- // tokenSymbol(tok::unknown) will match any rule.
- void operator()(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead) {
- assert(isToken(Lookahead));
-
- NextPopHead = 0;
- this->Heads = &Heads;
- this->Lookahead = Lookahead;
- assert(Sequences.empty());
- SequenceStorageCount = 0;
-
- popPending();
- while (!Sequences.empty()) {
- pushNext();
- popPending();
- }
- }
-
-private:
- bool canReduce(const Rule &R, RuleID RID,
- llvm::ArrayRef<const ForestNode *> RHS) const {
- if (!R.Guarded)
- return true;
- if (auto Guard = Lang.Guards.lookup(RID))
- return Guard({RHS, Params.Code, Lookahead});
- LLVM_DEBUG(llvm::dbgs()
- << llvm::formatv("missing guard implementation for rule {0}\n",
- Lang.G.dumpRule(RID)));
- return true;
- }
- // pop walks up the parent chain(s) for a reduction from Head by to Rule.
- // Once we reach the end, record the bases and sequences.
- void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) {
- LLVM_DEBUG(llvm::dbgs() << " Pop " << Lang.G.dumpRule(RID) << "\n");
- Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
- TempSequence.resize_for_overwrite(Rule.Size);
- auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
- TempSequence[Rule.Size - 1 - I] = N->Payload;
- if (I + 1 == Rule.Size) {
- F.Start = TempSequence.front()->startTokenIndex();
- LLVM_DEBUG({
- for (const auto *B : N->parents())
- llvm::dbgs() << " --> base at S" << B->State << "\n";
- });
- if (!canReduce(Rule, RID, TempSequence))
- return;
- // Copy the chain to stable storage so it can be enqueued.
- if (SequenceStorageCount == SequenceStorage.size())
- SequenceStorage.emplace_back();
- SequenceStorage[SequenceStorageCount] = TempSequence;
- Sequence *Seq = &SequenceStorage[SequenceStorageCount++];
-
- Sequences.emplace(F, PushSpec{N, Seq});
- return;
- }
- for (const GSS::Node *Parent : N->parents())
- DFS(Parent, I + 1, DFS);
- };
- DFS(Head, 0, DFS);
- }
-
- // popPending pops every available reduction.
- void popPending() {
- for (; NextPopHead < Heads->size(); ++NextPopHead) {
- // In trivial cases, we perform the complete reduce here!
- if (popAndPushTrivial())
- continue;
- for (RuleID RID :
- Lang.Table.getReduceRules((*Heads)[NextPopHead]->State)) {
- const auto &Rule = Lang.G.lookupRule(RID);
- if (Lang.Table.canFollow(Rule.Target, Lookahead))
- pop((*Heads)[NextPopHead], RID, Rule);
- }
- }
- }
-
- // Storage reused by each call to pushNext.
- std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
- std::vector<std::pair<RuleID, SequenceRef>> FamilySequences;
- std::vector<const GSS::Node *> Parents;
- std::vector<const ForestNode *> SequenceNodes;
-
- // Process one push family, forming a forest node.
- // This produces new GSS heads which may enable more pops.
- void pushNext() {
- assert(!Sequences.empty());
- Family F = Sequences.top().first;
-
- LLVM_DEBUG(llvm::dbgs() << " Push " << Lang.G.symbolName(F.Symbol)
- << " from token " << F.Start << "\n");
-
- // Grab the sequences and bases for this family.
- // We don't care which rule yielded each base. If Family.Symbol is S, the
- // base includes an item X := ... • S ... and since the grammar is
- // context-free, *all* parses of S are valid here.
- FamilySequences.clear();
- FamilyBases.clear();
- do {
- const PushSpec &Push = Sequences.top().second;
- FamilySequences.emplace_back(Sequences.top().first.Rule, *Push.Seq);
- for (const GSS::Node *Base : Push.LastPop->parents()) {
- auto NextState = Lang.Table.getGoToState(Base->State, F.Symbol);
- assert(NextState.has_value() && "goto must succeed after reduce!");
- FamilyBases.emplace_back(*NextState, Base);
- }
-
- Sequences.pop();
- } while (!Sequences.empty() && Sequences.top().first == F);
- // Build a forest node for each unique sequence.
- sortAndUnique(FamilySequences);
- SequenceNodes.clear();
- for (const auto &SequenceSpec : FamilySequences)
- SequenceNodes.push_back(&Params.Forest.createSequence(
- F.Symbol, SequenceSpec.first, SequenceSpec.second.S));
- // Wrap in an ambiguous node if needed.
- const ForestNode *Parsed =
- SequenceNodes.size() == 1
- ? SequenceNodes.front()
- : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
- LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n");
-
- // Bases for this family, deduplicate them, and group by the goTo State.
- sortAndUnique(FamilyBases);
- // Create a GSS node for each unique goto state.
- llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases;
- while (!BasesLeft.empty()) {
- StateID NextState = BasesLeft.front().first;
- Parents.clear();
- for (const auto &Base : BasesLeft) {
- if (Base.first != NextState)
- break;
- Parents.push_back(Base.second);
- }
- BasesLeft = BasesLeft.drop_front(Parents.size());
- Heads->push_back(Params.GSStack.addNode(NextState, Parsed, Parents));
- }
- }
-
- // In general we split a reduce into a pop/push, so concurrently-available
- // reductions can run in the correct order. The data structures are expensive.
- //
- // When only one reduction is possible at a time, we can skip this:
- // we pop and immediately push, as an LR parser (as opposed to GLR) would.
- // This is valid whenever there's only one concurrent PushSpec.
- //
- // This function handles a trivial but common subset of these cases:
- // - there must be no pending pushes, and only one poppable head
- // - the head must have only one reduction rule
- // - the reduction path must be a straight line (no multiple parents)
- // (Roughly this means there's no local ambiguity, so the LR algorithm works).
- //
- // Returns true if we successfully consumed the next unpopped head.
- bool popAndPushTrivial() {
- if (!Sequences.empty() || Heads->size() != NextPopHead + 1)
- return false;
- const GSS::Node *Head = Heads->back();
- std::optional<RuleID> RID;
- for (RuleID R : Lang.Table.getReduceRules(Head->State)) {
- if (RID.has_value())
- return false;
- RID = R;
- }
- if (!RID)
- return true; // no reductions available, but we've processed the head!
- const auto &Rule = Lang.G.lookupRule(*RID);
- if (!Lang.Table.canFollow(Rule.Target, Lookahead))
- return true; // reduction is not available
- const GSS::Node *Base = Head;
- TempSequence.resize_for_overwrite(Rule.Size);
- for (unsigned I = 0; I < Rule.Size; ++I) {
- if (Base->parents().size() != 1)
- return false;
- TempSequence[Rule.Size - 1 - I] = Base->Payload;
- Base = Base->parents().front();
- }
- if (!canReduce(Rule, *RID, TempSequence))
- return true; // reduction is not available
- const ForestNode *Parsed =
- &Params.Forest.createSequence(Rule.Target, *RID, TempSequence);
- auto NextState = Lang.Table.getGoToState(Base->State, Rule.Target);
- assert(NextState.has_value() && "goto must succeed after reduce!");
- Heads->push_back(Params.GSStack.addNode(*NextState, Parsed, {Base}));
- LLVM_DEBUG(llvm::dbgs()
- << " Reduce (trivial) " << Lang.G.dumpRule(*RID) << "\n"
- << " --> S" << Heads->back()->State << "\n");
- return true;
- }
-};
-
-} // namespace
-
-ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
- const Language &Lang) {
- GLRReduce Reduce(Params, Lang);
- assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
- llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
- auto &GSS = Params.GSStack;
-
- StateID StartState = Lang.Table.getStartState(StartSymbol);
- // Heads correspond to the parse of tokens [0, I), NextHeads to [0, I+1).
- std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
- /*ForestNode=*/nullptr,
- {})};
- // Invariant: Heads is partitioned by source: {shifted | reduced}.
- // HeadsPartition is the index of the first head formed by reduction.
- // We use this to discard and recreate the reduced heads during recovery.
- unsigned HeadsPartition = Heads.size();
- std::vector<const GSS::Node *> NextHeads;
- auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
- assert(NextHeads.empty() && "Running GC at the wrong time!");
- if (++I != 20) // Run periodically to balance CPU and memory usage.
- return;
- I = 0;
-
- // We need to copy the list: Roots is consumed by the GC.
- Roots = Heads;
- GSS.gc(std::move(Roots));
- };
- // Each iteration fully processes a single token.
- for (unsigned I = 0; I < Terminals.size();) {
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
- "Next token {0} (id={1})\n",
- Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
- // Consume the token.
- glrShift(Heads, Terminals[I], Params, Lang, NextHeads);
-
- // If we weren't able to consume the token, try to skip over some tokens
- // so we can keep parsing.
- if (NextHeads.empty()) {
- // The reduction in the previous round was constrained by lookahead.
- // On valid code this only rejects dead ends, but on broken code we should
- // consider all possibilities.
- //
- // We discard all heads formed by reduction, and recreate them without
- // this constraint. This may duplicate some nodes, but it's rare.
- LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
- "Re-reducing without lookahead.\n");
- Heads.resize(HeadsPartition);
- Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
-
- glrRecover(Heads, I, Params, Lang, NextHeads);
- if (NextHeads.empty())
- // FIXME: Ensure the `_ := start-symbol` rules have a fallback
- // error-recovery strategy attached. Then this condition can't happen.
- return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
- } else
- ++I;
-
- // Form nonterminals containing the token we just consumed.
- SymbolID Lookahead =
- I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
- HeadsPartition = NextHeads.size();
- Reduce(NextHeads, Lookahead);
- // Prepare for the next token.
- std::swap(Heads, NextHeads);
- NextHeads.clear();
- MaybeGC();
- }
- LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
-
- // The parse was successful if in state `_ := start-symbol EOF .`
- // The GSS parent has `_ := start-symbol . EOF`; its payload is the parse.
- auto AfterStart = Lang.Table.getGoToState(StartState, StartSymbol);
- assert(AfterStart.has_value() && "goto must succeed after start symbol!");
- auto Accept = Lang.Table.getShiftState(*AfterStart, tokenSymbol(tok::eof));
- assert(Accept.has_value() && "shift EOF must succeed!");
- auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
- const ForestNode *Result = nullptr;
- for (const auto *Head : Heads) {
- if (Head->State == *Accept) {
- assert(Head->Payload->symbol() == tokenSymbol(tok::eof));
- assert(Result == nullptr && "multiple results!");
- Result = Head->parents().front()->Payload;
- assert(Result->symbol() == StartSymbol);
- }
- }
- return Result;
- };
- if (auto *Result = SearchForAccept(Heads))
- return *const_cast<ForestNode *>(Result); // Safe: we created all nodes.
- // We failed to parse the input, returning an opaque forest node for recovery.
- // FIXME: as above, we can add fallback error handling so this is impossible.
- return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
-}
-
-void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
- const ParseParams &Params, const Language &Lang) {
- // Create a new GLRReduce each time for tests, performance doesn't matter.
- GLRReduce{Params, Lang}(Heads, Lookahead);
-}
-
-const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
- llvm::ArrayRef<const Node *> Parents) {
- Node *Result = new (allocate(Parents.size())) Node();
- Result->State = State;
- Result->GCParity = GCParity;
- Result->ParentCount = Parents.size();
- Alive.push_back(Result);
- ++NodesCreated;
- Result->Payload = Symbol;
- if (!Parents.empty())
- llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
- return Result;
-}
-
-GSS::Node *GSS::allocate(unsigned Parents) {
- if (FreeList.size() <= Parents)
- FreeList.resize(Parents + 1);
- auto &SizedList = FreeList[Parents];
- if (!SizedList.empty()) {
- auto *Result = SizedList.back();
- SizedList.pop_back();
- return Result;
- }
- return static_cast<Node *>(
- Arena.Allocate(sizeof(Node) + Parents * sizeof(Node *), alignof(Node)));
-}
-
-void GSS::destroy(Node *N) {
- unsigned ParentCount = N->ParentCount;
- N->~Node();
- assert(FreeList.size() > ParentCount && "established on construction!");
- FreeList[ParentCount].push_back(N);
-}
-
-unsigned GSS::gc(std::vector<const Node *> &&Queue) {
-#ifndef NDEBUG
- auto ParityMatches = [&](const Node *N) { return N->GCParity == GCParity; };
- assert("Before GC" && llvm::all_of(Alive, ParityMatches));
- auto Deferred = llvm::make_scope_exit(
- [&] { assert("After GC" && llvm::all_of(Alive, ParityMatches)); });
- assert(llvm::all_of(
- Queue, [&](const Node *R) { return llvm::is_contained(Alive, R); }));
-#endif
- unsigned InitialCount = Alive.size();
-
- // Mark
- GCParity = !GCParity;
- while (!Queue.empty()) {
- Node *N = const_cast<Node *>(Queue.back()); // Safe: we created these nodes.
- Queue.pop_back();
- if (N->GCParity != GCParity) { // Not seen yet
- N->GCParity = GCParity; // Mark as seen
- for (const Node *P : N->parents()) // And walk parents
- Queue.push_back(P);
- }
- }
- // Sweep
- llvm::erase_if(Alive, [&](Node *N) {
- if (N->GCParity == GCParity) // Walk reached this node.
- return false;
- destroy(N);
- return true;
- });
-
- LLVM_DEBUG(llvm::dbgs() << "GC pruned " << (InitialCount - Alive.size())
- << "/" << InitialCount << " GSS nodes\n");
- return InitialCount - Alive.size();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/cli/CLI.cpp b/clang-tools-extra/pseudo/lib/cli/CLI.cpp
deleted file mode 100644
index 5c7c3b6c827ea..0000000000000
--- a/clang-tools-extra/pseudo/lib/cli/CLI.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-//===--- CLI.cpp - ----------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/cxx/CXX.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-
-static llvm::cl::opt<std::string> Grammar(
- "grammar",
- llvm::cl::desc(
- "Specify a BNF grammar file path, or a builtin language (cxx)."),
- llvm::cl::init("cxx"));
-
-namespace clang {
-namespace pseudo {
-
-const Language &getLanguageFromFlags() {
- if (::Grammar == "cxx")
- return cxx::getLanguage();
-
- static Language *Lang = []() {
- // Read from a bnf grammar file.
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GrammarText =
- llvm::MemoryBuffer::getFile(::Grammar);
- if (std::error_code EC = GrammarText.getError()) {
- llvm::errs() << "Error: can't read grammar file '" << ::Grammar
- << "': " << EC.message() << "\n";
- std::exit(1);
- }
- std::vector<std::string> Diags;
- auto G = Grammar::parseBNF(GrammarText->get()->getBuffer(), Diags);
- for (const auto &Diag : Diags)
- llvm::errs() << Diag << "\n";
- auto Table = LRTable::buildSLR(G);
- return new Language{
- std::move(G),
- std::move(Table),
- llvm::DenseMap<ExtensionID, RuleGuard>(),
- llvm::DenseMap<ExtensionID, RecoveryStrategy>(),
- };
- }();
- return *Lang;
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
deleted file mode 100644
index 68e644f62fded..0000000000000
--- a/clang-tools-extra/pseudo/lib/cli/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- Support
- )
-
-add_clang_library(clangPseudoCLI
- CLI.cpp
-
- # FIXME export the headers from clangPseudoCXX instead
- DEPENDS
- cxx_gen
-
- LINK_LIBS
- clangPseudoGrammar
- clangPseudoCXX
- )
diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
deleted file mode 100644
index d56d16c893c3d..0000000000000
--- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-set(LLVM_LINK_COMPONENTS
- Support
- )
-
-add_clang_library(clangPseudoCXX
- CXX.cpp
-
- DEPENDS
- cxx_gen
-
- LINK_LIBS
- clangBasic
- clangPseudo
- clangPseudoGrammar
- )
diff --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp
deleted file mode 100644
index 4188dab31d3a9..0000000000000
--- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp
+++ /dev/null
@@ -1,452 +0,0 @@
-//===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/cxx/CXX.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/CharInfo.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/Debug.h"
-#include <utility>
-#define DEBUG_TYPE "CXX.cpp"
-
-namespace clang {
-namespace pseudo {
-namespace cxx {
-namespace {
-static const char *CXXBNF =
-#include "CXXBNF.inc"
- ;
-
-// User-defined string literals look like `""suffix`.
-bool isStringUserDefined(const Token &Tok) {
- return !Tok.text().ends_with("\"");
-}
-bool isCharUserDefined(const Token &Tok) { return !Tok.text().ends_with("'"); }
-
-// Combinable flags describing numbers.
-// Clang has just one numeric_token kind, the grammar has 4.
-enum NumericKind {
- Integer = 0,
- Floating = 1 << 0,
- UserDefined = 1 << 1,
-};
-// Determine the kind of numeric_constant we have.
-// We can assume it's something valid, as it has been lexed.
-// FIXME: is this expensive enough that we should set flags on the token
-// and reuse them rather than computing it for each guard?
-unsigned numKind(const Token &Tok) {
- assert(Tok.Kind == tok::numeric_constant);
- llvm::StringRef Text = Tok.text();
- if (Text.size() <= 1)
- return Integer;
- bool Hex =
- Text.size() > 2 && Text[0] == '0' && (Text[1] == 'x' || Text[1] == 'X');
- uint8_t K = Integer;
-
- for (char C : Text) {
- switch (C) {
- case '.':
- K |= Floating;
- break;
- case 'e':
- case 'E':
- if (!Hex)
- K |= Floating;
- break;
- case 'p':
- case 'P':
- if (Hex)
- K |= Floating;
- break;
- case '_':
- K |= UserDefined;
- break;
- default:
- break;
- }
- }
-
- // We would be done here, but there are stdlib UDLs that lack _.
- // We must distinguish these from the builtin suffixes.
- unsigned LastLetter = Text.size();
- while (LastLetter > 0 && isLetter(Text[LastLetter - 1]))
- --LastLetter;
- if (LastLetter == Text.size()) // Common case
- return NumericKind(K);
- // Trailing d/e/f are not part of the suffix in hex numbers.
- while (Hex && LastLetter < Text.size() && isHexDigit(Text[LastLetter]))
- ++LastLetter;
- return llvm::StringSwitch<int, unsigned>(Text.substr(LastLetter))
- // std::chrono
- .Cases("h", "min", "s", "ms", "us", "ns", "d", "y", K | UserDefined)
- // complex
- .Cases("il", "i", "if", K | UserDefined)
- .Default(K);
-}
-
-// RHS is expected to contain a single terminal.
-// Returns the corresponding token.
-const Token &onlyToken(tok::TokenKind Kind,
- const ArrayRef<const ForestNode *> RHS,
- const TokenStream &Tokens) {
- assert(RHS.size() == 1 && RHS.front()->symbol() == tokenSymbol(Kind));
- return Tokens.tokens()[RHS.front()->startTokenIndex()];
-}
-// RHS is expected to contain a single symbol.
-// Returns the corresponding ForestNode.
-const ForestNode &onlySymbol(SymbolID Kind,
- const ArrayRef<const ForestNode *> RHS,
- const TokenStream &Tokens) {
- assert(RHS.size() == 1 && RHS.front()->symbol() == Kind);
- return *RHS.front();
-}
-
-bool isFunctionDeclarator(const ForestNode *Declarator) {
- assert(Declarator->symbol() == cxx::Symbol::declarator);
- bool IsFunction = false;
- while (true) {
- // not well-formed code, return the best guess.
- if (Declarator->kind() != ForestNode::Sequence)
- return IsFunction;
-
- switch (Declarator->rule()) {
- case rule::noptr_declarator::declarator_id: // reached the bottom
- return IsFunction;
- // *X is a nonfunction (unless X is a function).
- case rule::ptr_declarator::ptr_operator__ptr_declarator:
- Declarator = Declarator->elements()[1];
- IsFunction = false;
- continue;
- // X() is a function (unless X is a pointer or similar).
- case rule::declarator::
- noptr_declarator__parameters_and_qualifiers__trailing_return_type:
- case rule::noptr_declarator::noptr_declarator__parameters_and_qualifiers:
- Declarator = Declarator->elements()[0];
- IsFunction = true;
- continue;
- // X[] is an array (unless X is a pointer or function).
- case rule::noptr_declarator::
- noptr_declarator__L_SQUARE__constant_expression__R_SQUARE:
- case rule::noptr_declarator::noptr_declarator__L_SQUARE__R_SQUARE:
- Declarator = Declarator->elements()[0];
- IsFunction = false;
- continue;
- // (X) is whatever X is.
- case rule::noptr_declarator::L_PAREN__ptr_declarator__R_PAREN:
- Declarator = Declarator->elements()[1];
- continue;
- case rule::ptr_declarator::noptr_declarator:
- case rule::declarator::ptr_declarator:
- Declarator = Declarator->elements()[0];
- continue;
-
- default:
- assert(false && "unhandled declarator for IsFunction");
- return IsFunction;
- }
- }
- llvm_unreachable("unreachable");
-}
-
-bool guardNextTokenNotElse(const GuardParams &P) {
- return symbolToToken(P.Lookahead) != tok::kw_else;
-}
-
-bool specifiesStructuredBinding(const GuardParams &P) {
- const auto DSS = P.RHS[0];
- assert(DSS->symbol() == Symbol::decl_specifier_seq);
-
- auto Length = P.RHS[1]->startTokenIndex() - DSS->startTokenIndex();
- for (const auto &T :
- P.Tokens.tokens().slice(DSS->startTokenIndex(), Length)) {
- switch (T.Kind) {
- case clang::tok::kw_static:
- case clang::tok::kw_thread_local:
- case clang::tok::kw_auto:
- case clang::tok::kw_const:
- case clang::tok::kw_volatile:
- break;
- default:
- return false;
- }
- }
- return true;
-}
-
-// Whether this e.g. decl-specifier contains an "exclusive" type such as a class
-// name, and thus can't combine with a second exclusive type.
-//
-// Returns false for
-// - non-types
-// - "unsigned" etc that may suffice as types but may modify others
-// - cases of uncertainty (e.g. due to ambiguity)
-bool hasExclusiveType(const ForestNode *N) {
- // FIXME: every time we apply this check, we walk the whole subtree.
- // Add per-node caching instead.
- while (true) {
- assert(N->symbol() == Symbol::decl_specifier_seq ||
- N->symbol() == Symbol::type_specifier_seq ||
- N->symbol() == Symbol::defining_type_specifier_seq ||
- N->symbol() == Symbol::decl_specifier ||
- N->symbol() == Symbol::type_specifier ||
- N->symbol() == Symbol::defining_type_specifier ||
- N->symbol() == Symbol::simple_type_specifier);
- if (N->kind() == ForestNode::Opaque)
- return false; // conservative
- if (N->kind() == ForestNode::Ambiguous)
- return llvm::all_of(N->alternatives(), hasExclusiveType); // conservative
- // All supported symbols are nonterminals.
- assert(N->kind() == ForestNode::Sequence);
- switch (N->rule()) {
- // seq := element seq: check element then continue into seq
- case rule::decl_specifier_seq::decl_specifier__decl_specifier_seq:
- case rule::defining_type_specifier_seq::defining_type_specifier__defining_type_specifier_seq:
- case rule::type_specifier_seq::type_specifier__type_specifier_seq:
- if (hasExclusiveType(N->children()[0]))
- return true;
- N = N->children()[1];
- continue;
- // seq := element: continue into element
- case rule::decl_specifier_seq::decl_specifier:
- case rule::type_specifier_seq::type_specifier:
- case rule::defining_type_specifier_seq::defining_type_specifier:
- N = N->children()[0];
- continue;
-
- // defining-type-specifier
- case rule::defining_type_specifier::type_specifier:
- N = N->children()[0];
- continue;
- case rule::defining_type_specifier::class_specifier:
- case rule::defining_type_specifier::enum_specifier:
- return true;
-
- // decl-specifier
- case rule::decl_specifier::defining_type_specifier:
- N = N->children()[0];
- continue;
- case rule::decl_specifier::CONSTEVAL:
- case rule::decl_specifier::CONSTEXPR:
- case rule::decl_specifier::CONSTINIT:
- case rule::decl_specifier::INLINE:
- case rule::decl_specifier::FRIEND:
- case rule::decl_specifier::storage_class_specifier:
- case rule::decl_specifier::TYPEDEF:
- case rule::decl_specifier::function_specifier:
- return false;
-
- // type-specifier
- case rule::type_specifier::elaborated_type_specifier:
- case rule::type_specifier::typename_specifier:
- return true;
- case rule::type_specifier::simple_type_specifier:
- N = N->children()[0];
- continue;
- case rule::type_specifier::cv_qualifier:
- return false;
-
- // simple-type-specifier
- case rule::simple_type_specifier::type_name:
- case rule::simple_type_specifier::template_name:
- case rule::simple_type_specifier::builtin_type:
- case rule::simple_type_specifier::nested_name_specifier__TEMPLATE__simple_template_id:
- case rule::simple_type_specifier::nested_name_specifier__template_name:
- case rule::simple_type_specifier::nested_name_specifier__type_name:
- case rule::simple_type_specifier::decltype_specifier:
- case rule::simple_type_specifier::placeholder_type_specifier:
- return true;
- case rule::simple_type_specifier::LONG:
- case rule::simple_type_specifier::SHORT:
- case rule::simple_type_specifier::SIGNED:
- case rule::simple_type_specifier::UNSIGNED:
- return false;
-
- default:
- LLVM_DEBUG(llvm::errs() << "Unhandled rule " << N->rule() << "\n");
- llvm_unreachable("hasExclusiveType be exhaustive!");
- }
- }
-}
-
-llvm::DenseMap<ExtensionID, RuleGuard> buildGuards() {
-#define GUARD(cond) \
- { \
- [](const GuardParams &P) { return cond; } \
- }
-#define TOKEN_GUARD(kind, cond) \
- [](const GuardParams& P) { \
- const Token &Tok = onlyToken(tok::kind, P.RHS, P.Tokens); \
- return cond; \
- }
-#define SYMBOL_GUARD(kind, cond) \
- [](const GuardParams& P) { \
- const ForestNode &N = onlySymbol(Symbol::kind, P.RHS, P.Tokens); \
- return cond; \
- }
- return {
- {rule::function_declarator::declarator,
- SYMBOL_GUARD(declarator, isFunctionDeclarator(&N))},
- {rule::non_function_declarator::declarator,
- SYMBOL_GUARD(declarator, !isFunctionDeclarator(&N))},
-
- // A {decl,type,defining-type}-specifier-sequence cannot have multiple
- // "exclusive" types (like class names): a value has only one type.
- {rule::defining_type_specifier_seq::
- defining_type_specifier__defining_type_specifier_seq,
- GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))},
- {rule::type_specifier_seq::type_specifier__type_specifier_seq,
- GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))},
- {rule::decl_specifier_seq::decl_specifier__decl_specifier_seq,
- GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))},
-
- {rule::contextual_override::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "override")},
- {rule::contextual_final::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "final")},
- {rule::import_keyword::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "import")},
- {rule::export_keyword::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "export")},
- {rule::module_keyword::IDENTIFIER,
- TOKEN_GUARD(identifier, Tok.text() == "module")},
- {rule::contextual_zero::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, Tok.text() == "0")},
-
- {rule::selection_statement::IF__L_PAREN__condition__R_PAREN__statement,
- guardNextTokenNotElse},
- {rule::selection_statement::
- IF__L_PAREN__init_statement__condition__R_PAREN__statement,
- guardNextTokenNotElse},
- {rule::selection_statement::
- IF__CONSTEXPR__L_PAREN__condition__R_PAREN__statement,
- guardNextTokenNotElse},
- {rule::selection_statement::
- IF__CONSTEXPR__L_PAREN__init_statement__condition__R_PAREN__statement,
- guardNextTokenNotElse},
-
- // Implement C++ [basic.lookup.qual.general]:
- // If a name, template-id, or decltype-specifier is followed by a
- // ::, it shall designate a namespace, class, enumeration, or
- // dependent type, and the :: is never interpreted as a complete
- // nested-name-specifier.
- {rule::nested_name_specifier::COLONCOLON,
- TOKEN_GUARD(coloncolon, Tok.prev().Kind != tok::identifier)},
-
- // Implement C++ [dcl.pre#6]:
- // A simple-declaration with an identifier-list is called a structured
- // binding declaration ([dcl.struct.bind]). If the decl-specifier-seq
- // contains any decl-specifier other than static, thread_local, auto,
- // or cv-qualifiers, the program is ill-formed.
- {rule::simple_declaration::
- decl_specifier_seq__ref_qualifier__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI,
- specifiesStructuredBinding},
- {rule::simple_declaration::
- decl_specifier_seq__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI,
- specifiesStructuredBinding},
-
- // The grammar distinguishes (only) user-defined vs plain string literals,
- // where the clang lexer distinguishes (only) encoding types.
- {rule::user_defined_string_literal_chunk::STRING_LITERAL,
- TOKEN_GUARD(string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::UTF8_STRING_LITERAL,
- TOKEN_GUARD(utf8_string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::UTF16_STRING_LITERAL,
- TOKEN_GUARD(utf16_string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::UTF32_STRING_LITERAL,
- TOKEN_GUARD(utf32_string_literal, isStringUserDefined(Tok))},
- {rule::user_defined_string_literal_chunk::WIDE_STRING_LITERAL,
- TOKEN_GUARD(wide_string_literal, isStringUserDefined(Tok))},
- {rule::string_literal_chunk::STRING_LITERAL,
- TOKEN_GUARD(string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::UTF8_STRING_LITERAL,
- TOKEN_GUARD(utf8_string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::UTF16_STRING_LITERAL,
- TOKEN_GUARD(utf16_string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::UTF32_STRING_LITERAL,
- TOKEN_GUARD(utf32_string_literal, !isStringUserDefined(Tok))},
- {rule::string_literal_chunk::WIDE_STRING_LITERAL,
- TOKEN_GUARD(wide_string_literal, !isStringUserDefined(Tok))},
- // And the same for chars.
- {rule::user_defined_character_literal::CHAR_CONSTANT,
- TOKEN_GUARD(char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::UTF8_CHAR_CONSTANT,
- TOKEN_GUARD(utf8_char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::UTF16_CHAR_CONSTANT,
- TOKEN_GUARD(utf16_char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::UTF32_CHAR_CONSTANT,
- TOKEN_GUARD(utf32_char_constant, isCharUserDefined(Tok))},
- {rule::user_defined_character_literal::WIDE_CHAR_CONSTANT,
- TOKEN_GUARD(wide_char_constant, isCharUserDefined(Tok))},
- {rule::character_literal::CHAR_CONSTANT,
- TOKEN_GUARD(char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::UTF8_CHAR_CONSTANT,
- TOKEN_GUARD(utf8_char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::UTF16_CHAR_CONSTANT,
- TOKEN_GUARD(utf16_char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::UTF32_CHAR_CONSTANT,
- TOKEN_GUARD(utf32_char_constant, !isCharUserDefined(Tok))},
- {rule::character_literal::WIDE_CHAR_CONSTANT,
- TOKEN_GUARD(wide_char_constant, !isCharUserDefined(Tok))},
- // clang just has one NUMERIC_CONSTANT token for {ud,plain}x{float,int}
- {rule::user_defined_integer_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == (Integer | UserDefined))},
- {rule::user_defined_floating_point_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == (Floating | UserDefined))},
- {rule::integer_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == Integer)},
- {rule::floating_point_literal::NUMERIC_CONSTANT,
- TOKEN_GUARD(numeric_constant, numKind(Tok) == Floating)},
- };
-#undef TOKEN_GUARD
-#undef SYMBOL_GUARD
-}
-
-Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) {
- assert(Begin > 0);
- const Token &Left = Tokens.tokens()[Begin - 1];
- assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren ||
- Left.Kind == tok::l_square);
- if (const Token *Right = Left.pair()) {
- assert(Tokens.index(*Right) > Begin - 1);
- return Tokens.index(*Right);
- }
- return Token::Invalid;
-}
-
-llvm::DenseMap<ExtensionID, RecoveryStrategy> buildRecoveryStrategies() {
- return {
- {Extension::Brackets, recoverBrackets},
- };
-}
-
-} // namespace
-
-const Language &getLanguage() {
- static const auto &CXXLanguage = []() -> const Language & {
- std::vector<std::string> Diags;
- auto G = Grammar::parseBNF(CXXBNF, Diags);
- assert(Diags.empty());
- LRTable Table = LRTable::buildSLR(G);
- const Language *PL = new Language{
- std::move(G),
- std::move(Table),
- buildGuards(),
- buildRecoveryStrategies(),
- };
- return *PL;
- }();
- return CXXLanguage;
-}
-
-} // namespace cxx
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf
deleted file mode 100644
index 36caf7b1e6337..0000000000000
--- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf
+++ /dev/null
@@ -1,776 +0,0 @@
-# This is a C++ grammar from the C++ standard [1].
-#
-# The grammar is a superset of the true grammar requring semantic constraints to
-# resolve ambiguities. The grammar is context-free and ambiguous (beyond the
-# limit of LR(k)). We use general parsing algorithm (e.g GLR) to handle the
-# grammar and generate a transition table which is used to drive the parsing.
-#
-# It aims to align with the ISO C++ grammar as much as possible. We adjust it
-# to fit the need for the grammar-based parser:
-# - attributes are omitted, which will be handled as comments;
-# - we don't allow nullable nonterminal symbols. There are few nullable
-# nonterminals in the spec grammar, they are adjusted to be non-nullable;
-# - the file merely describes the core C++ grammar. Preprocessor directives and
-# lexical conversions are omitted as we reuse clang's lexer and run a fake
-# preprocessor;
-# - grammar rules with the >> token are adjusted, the greatergreater token is
-# split into two > tokens, to make the GLR parser aware of nested templates
-# and right shift operator;
-#
-# Guidelines:
-# - nonterminals are lower_case; terminals (aka tokens) correspond to
-# clang::TokenKind, written as "IDENTIFIER", "USING", "::" etc;
-# - optional symbols are supported, with a _opt suffix;
-#
-# [1] https://isocpp.org/files/papers/N4860.pdf
-
-# _ lists all the start-symbols which we support parsing.
-#
-# We list important nonterminals as start symbols, rather than doing it for all
-# nonterminals by default, this reduces the number of states by 30% and LRTable
-# actions by 16%.
-_ := translation-unit EOF
-_ := statement-seq EOF
-_ := declaration-seq EOF
-
-# gram.key
-#! we don't distinguish between namespaces and namespace aliases, as it's hard
-#! and uninteresting.
-namespace-name := IDENTIFIER
-template-name := IDENTIFIER
-
-# gram.basic
-#! Custom modifications to eliminate optional declaration-seq
-translation-unit := declaration-seq
-translation-unit := global-module-fragment_opt module-declaration declaration-seq_opt private-module-fragment_opt
-
-# gram.expr
-# expr.prim
-primary-expression := literal
-primary-expression := THIS
-primary-expression := ( expression )
-primary-expression := id-expression
-primary-expression := lambda-expression
-primary-expression := fold-expression
-primary-expression := requires-expression
-id-expression := unqualified-id
-id-expression := qualified-id
-unqualified-id := IDENTIFIER
-unqualified-id := operator-function-id
-unqualified-id := conversion-function-id
-unqualified-id := literal-operator-id
-unqualified-id := ~ type-name
-unqualified-id := ~ decltype-specifier
-unqualified-id := template-id
-qualified-id := nested-name-specifier TEMPLATE_opt unqualified-id
-nested-name-specifier := :: [guard]
-nested-name-specifier := type-name ::
-nested-name-specifier := namespace-name ::
-nested-name-specifier := decltype-specifier ::
-nested-name-specifier := nested-name-specifier IDENTIFIER ::
-nested-name-specifier := nested-name-specifier TEMPLATE_opt simple-template-id ::
-lambda-expression := lambda-introducer lambda-declarator_opt compound-statement
-lambda-expression := lambda-introducer < template-parameter-list > requires-clause_opt lambda-declarator_opt compound-statement
-#! We allow a capture-default to appear anywhere in a capture-list.
-# This simplifies the grammar and error recovery.
-lambda-introducer := [ capture-list_opt ]
-lambda-declarator := ( parameter-declaration-clause_opt ) decl-specifier-seq_opt noexcept-specifier_opt trailing-return-type_opt requires-clause_opt
-capture-list := capture
-capture-list := capture-list , capture
-capture := capture-default
-capture := simple-capture
-capture := init-capture
-capture-default := &
-capture-default := =
-simple-capture := IDENTIFIER ..._opt
-simple-capture := & IDENTIFIER ..._opt
-simple-capture := THIS
-simple-capture := * THIS
-init-capture := ..._opt IDENTIFIER initializer
-init-capture := & ..._opt IDENTIFIER initializer
-fold-expression := ( cast-expression fold-operator ... )
-fold-expression := ( ... fold-operator cast-expression )
-fold-expression := ( cast-expression fold-operator ... fold-operator cast-expression )
-fold-operator := +
-fold-operator := -
-fold-operator := *
-fold-operator := /
-fold-operator := %
-fold-operator := ^
-fold-operator := |
-fold-operator := <<
-fold-operator := greatergreater
-fold-operator := +=
-fold-operator := -=
-fold-operator := *=
-fold-operator := /=
-fold-operator := %=
-fold-operator := ^=
-fold-operator := &=
-fold-operator := |=
-fold-operator := <<=
-fold-operator := >>=
-fold-operator := =
-fold-operator := ==
-fold-operator := !=
-fold-operator := <
-fold-operator := >
-fold-operator := <=
-fold-operator := >=
-fold-operator := &&
-fold-operator := ||
-fold-operator := ,
-fold-operator := .*
-fold-operator := ->*
-requires-expression := REQUIRES requirement-parameter-list_opt requirement-body
-requirement-parameter-list := ( parameter-declaration-clause_opt )
-requirement-body := { requirement-seq }
-requirement-seq := requirement
-requirement-seq := requirement-seq requirement
-requirement := simple-requirement
-requirement := type-requirement
-requirement := compound-requirement
-requirement := nested-requirement
-simple-requirement := expression ;
-type-requirement := TYPENAME nested-name-specifier_opt type-name ;
-compound-requirement := { expression } NOEXCEPT_opt return-type-requirement_opt ;
-return-type-requirement := -> type-constraint
-nested-requirement := REQUIRES constraint-expression ;
-# expr.post
-postfix-expression := primary-expression
-postfix-expression := postfix-expression [ expr-or-braced-init-list ]
-postfix-expression := postfix-expression ( expression-list_opt )
-postfix-expression := simple-type-specifier ( expression-list_opt )
-postfix-expression := typename-specifier ( expression-list_opt )
-postfix-expression := simple-type-specifier braced-init-list
-postfix-expression := postfix-expression . TEMPLATE_opt id-expression
-postfix-expression := postfix-expression -> TEMPLATE_opt id-expression
-postfix-expression := postfix-expression ++
-postfix-expression := postfix-expression --
-postfix-expression := DYNAMIC_CAST < type-id > ( expression )
-postfix-expression := STATIC_CAST < type-id > ( expression )
-postfix-expression := REINTERPRET_CAST < type-id > ( expression )
-postfix-expression := CONST_CAST < type-id > ( expression )
-postfix-expression := TYPEID ( expression )
-postfix-expression := TYPEID ( type-id )
-#! Standard defines expression-list in terms of initializer-list, but our
-# initializer-list allows designators.
-expression-list := initializer-clause ..._opt
-expression-list := expression-list , initializer-clause ..._opt
-# expr.unary
-unary-expression := postfix-expression
-unary-expression := unary-operator cast-expression
-unary-expression := ++ cast-expression
-unary-expression := -- cast-expression
-unary-expression := await-expression
-unary-expression := SIZEOF unary-expression
-unary-expression := SIZEOF ( type-id )
-unary-expression := SIZEOF ... ( IDENTIFIER )
-unary-expression := ALIGNOF ( type-id )
-unary-expression := noexcept-expression
-unary-expression := new-expression
-unary-expression := delete-expression
-unary-operator := *
-unary-operator := &
-unary-operator := +
-unary-operator := -
-unary-operator := !
-unary-operator := ~
-await-expression := CO_AWAIT cast-expression
-noexcept-expression := NOEXCEPT ( expression )
-new-expression := ::_opt NEW new-placement_opt new-type-id new-initializer_opt
-new-expression := ::_opt NEW new-placement_opt ( type-id ) new-initializer_opt
-new-placement := ( expression-list )
-new-type-id := type-specifier-seq new-declarator_opt
-new-declarator := ptr-operator new-declarator_opt
-new-declarator := noptr-new-declarator
-noptr-new-declarator := [ expression_opt ]
-noptr-new-declarator := noptr-new-declarator [ constant-expression ]
-new-initializer := ( expression-list_opt )
-new-initializer := braced-init-list
-delete-expression := ::_opt DELETE cast-expression
-delete-expression := ::_opt DELETE [ ] cast-expression
-cast-expression := unary-expression
-cast-expression := ( type-id ) cast-expression
-# expr.mptr.oper
-pm-expression := cast-expression
-pm-expression := pm-expression .* cast-expression
-pm-expression := pm-expression ->* cast-expression
-# expr.mul
-multiplicative-expression := pm-expression
-multiplicative-expression := multiplicative-expression * pm-expression
-multiplicative-expression := multiplicative-expression / pm-expression
-multiplicative-expression := multiplicative-expression % pm-expression
-# expr.add
-additive-expression := multiplicative-expression
-additive-expression := additive-expression + multiplicative-expression
-additive-expression := additive-expression - multiplicative-expression
-# expr.shift
-shift-expression := additive-expression
-shift-expression := shift-expression << additive-expression
-shift-expression := shift-expression greatergreater additive-expression
-# expr.spaceship
-compare-expression := shift-expression
-compare-expression := compare-expression <=> shift-expression
-# expr.rel
-relational-expression := compare-expression
-relational-expression := relational-expression < compare-expression
-relational-expression := relational-expression > compare-expression
-relational-expression := relational-expression <= compare-expression
-relational-expression := relational-expression >= compare-expression
-# expr.eq
-equality-expression := relational-expression
-equality-expression := equality-expression == relational-expression
-equality-expression := equality-expression != relational-expression
-# expr.bit.and
-and-expression := equality-expression
-and-expression := and-expression & equality-expression
-# expr.xor
-exclusive-or-expression := and-expression
-exclusive-or-expression := exclusive-or-expression ^ and-expression
-# expr.or
-inclusive-or-expression := exclusive-or-expression
-inclusive-or-expression := inclusive-or-expression | exclusive-or-expression
-# expr.log.and
-logical-and-expression := inclusive-or-expression
-logical-and-expression := logical-and-expression && inclusive-or-expression
-# expr.log.or
-logical-or-expression := logical-and-expression
-logical-or-expression := logical-or-expression || logical-and-expression
-# expr.cond
-conditional-expression := logical-or-expression
-conditional-expression := logical-or-expression ? expression : assignment-expression
-# expr.ass
-yield-expression := CO_YIELD assignment-expression
-yield-expression := CO_YIELD braced-init-list
-throw-expression := THROW assignment-expression_opt
-assignment-expression := conditional-expression
-assignment-expression := yield-expression
-assignment-expression := throw-expression
-assignment-expression := logical-or-expression assignment-operator initializer-clause
-assignment-operator := =
-assignment-operator := *=
-assignment-operator := /=
-assignment-operator := %=
-assignment-operator := +=
-assignment-operator := -=
-assignment-operator := >>=
-assignment-operator := <<=
-assignment-operator := &=
-assignment-operator := ^=
-assignment-operator := |=
-# expr.comma
-expression := assignment-expression
-expression := expression , assignment-expression
-# expr.const
-constant-expression := conditional-expression
-
-# gram.stmt
-statement := labeled-statement
-statement := expression-statement
-statement := compound-statement
-statement := selection-statement
-statement := iteration-statement
-statement := jump-statement
-statement := declaration-statement
-statement := try-block
-init-statement := expression-statement
-init-statement := simple-declaration
-condition := expression
-condition := decl-specifier-seq declarator brace-or-equal-initializer
-labeled-statement := IDENTIFIER : statement
-labeled-statement := CASE constant-expression : statement
-labeled-statement := DEFAULT : statement
-expression-statement := expression_opt ;
-compound-statement := { statement-seq_opt [recover=Brackets] }
-statement-seq := statement
-statement-seq := statement-seq statement
-selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement [guard]
-selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) statement ELSE statement
-selection-statement := SWITCH ( init-statement_opt condition ) statement
-iteration-statement := WHILE ( condition ) statement
-iteration-statement := DO statement WHILE ( expression ) ;
-iteration-statement := FOR ( init-statement condition_opt ; expression_opt ) statement
-iteration-statement := FOR ( init-statement_opt for-range-declaration : for-range-initializer ) statement
-for-range-declaration := decl-specifier-seq declarator
-for-range-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ]
-for-range-initializer := expr-or-braced-init-list
-jump-statement := BREAK ;
-jump-statement := CONTINUE ;
-jump-statement := RETURN expr-or-braced-init-list_opt ;
-jump-statement := coroutine-return-statement
-jump-statement := GOTO IDENTIFIER ;
-coroutine-return-statement := CO_RETURN expr-or-braced-init-list_opt ;
-declaration-statement := block-declaration
-
-# gram.dcl
-declaration-seq := declaration
-declaration-seq := declaration-seq declaration
-declaration := block-declaration
-declaration := nodeclspec-function-declaration
-declaration := function-definition
-declaration := template-declaration
-declaration := deduction-guide
-declaration := explicit-instantiation
-declaration := explicit-specialization
-declaration := export-declaration
-declaration := linkage-specification
-declaration := namespace-definition
-declaration := empty-declaration
-declaration := module-import-declaration
-block-declaration := simple-declaration
-block-declaration := asm-declaration
-block-declaration := namespace-alias-definition
-block-declaration := using-declaration
-block-declaration := using-enum-declaration
-block-declaration := using-directive
-block-declaration := static_assert-declaration
-block-declaration := alias-declaration
-block-declaration := opaque-enum-declaration
-nodeclspec-function-declaration := function-declarator ;
-alias-declaration := USING IDENTIFIER = defining-type-id ;
-simple-declaration := decl-specifier-seq init-declarator-list_opt ;
-simple-declaration := decl-specifier-seq ref-qualifier_opt [ identifier-list ] initializer ; [guard]
-static_assert-declaration := STATIC_ASSERT ( constant-expression ) ;
-static_assert-declaration := STATIC_ASSERT ( constant-expression , string-literal ) ;
-empty-declaration := ;
-# dcl.spec
-decl-specifier := storage-class-specifier
-decl-specifier := defining-type-specifier
-decl-specifier := function-specifier
-decl-specifier := FRIEND
-decl-specifier := TYPEDEF
-decl-specifier := CONSTEXPR
-decl-specifier := CONSTEVAL
-decl-specifier := CONSTINIT
-decl-specifier := INLINE
-decl-specifier-seq := decl-specifier
-decl-specifier-seq := decl-specifier decl-specifier-seq [guard]
-storage-class-specifier := STATIC
-storage-class-specifier := THREAD_LOCAL
-storage-class-specifier := EXTERN
-storage-class-specifier := MUTABLE
-function-specifier := VIRTUAL
-function-specifier := explicit-specifier
-explicit-specifier := EXPLICIT ( constant-expression )
-explicit-specifier := EXPLICIT
-type-specifier := simple-type-specifier
-type-specifier := elaborated-type-specifier
-type-specifier := typename-specifier
-type-specifier := cv-qualifier
-type-specifier-seq := type-specifier
-type-specifier-seq := type-specifier type-specifier-seq [guard]
-defining-type-specifier := type-specifier
-defining-type-specifier := class-specifier
-defining-type-specifier := enum-specifier
-defining-type-specifier-seq := defining-type-specifier
-defining-type-specifier-seq := defining-type-specifier defining-type-specifier-seq [guard]
-simple-type-specifier := nested-name-specifier_opt type-name
-simple-type-specifier := nested-name-specifier TEMPLATE simple-template-id
-simple-type-specifier := decltype-specifier
-simple-type-specifier := placeholder-type-specifier
-simple-type-specifier := nested-name-specifier_opt template-name
-simple-type-specifier := SHORT
-simple-type-specifier := LONG
-simple-type-specifier := SIGNED
-simple-type-specifier := UNSIGNED
-simple-type-specifier := builtin-type
-#! builtin-type added to aid in classifying which specifiers may combined.
-builtin-type := CHAR
-builtin-type := CHAR8_T
-builtin-type := CHAR16_T
-builtin-type := CHAR32_T
-builtin-type := WCHAR_T
-builtin-type := BOOL
-builtin-type := INT
-builtin-type := FLOAT
-builtin-type := DOUBLE
-builtin-type := VOID
-#! Unlike C++ standard grammar, we don't distinguish the underlying type (class,
-#! enum, typedef) of the IDENTIFIER, as these ambiguities are "local" and don't
-#! affect the final parse tree. Eliminating them gives a significant performance
-#! boost to the parser.
-type-name := IDENTIFIER
-type-name := simple-template-id
-elaborated-type-specifier := class-key nested-name-specifier_opt IDENTIFIER
-elaborated-type-specifier := class-key simple-template-id
-elaborated-type-specifier := class-key nested-name-specifier TEMPLATE_opt simple-template-id
-elaborated-type-specifier := elaborated-enum-specifier
-elaborated-enum-specifier := ENUM nested-name-specifier_opt IDENTIFIER
-decltype-specifier := DECLTYPE ( expression )
-placeholder-type-specifier := type-constraint_opt AUTO
-placeholder-type-specifier := type-constraint_opt DECLTYPE ( AUTO )
-init-declarator-list := init-declarator
-init-declarator-list := init-declarator-list , init-declarator
-#! The standard grammar allows:
-#! 1) an initializer with any declarator, including a function declarator, this
-#! creates an ambiguity where a function definition is misparsed as a simple
-#! declaration;
-#! 2) an function-body with any declarator, includeing a non-function
-#! declarator, this creates an ambiguity whwere a simple-declaration is
-#! misparsed as a function-definition;
-#! We extend the standard declarator to function-declarator and non-function-declarator
-#! to eliminate these false parses.
-init-declarator := non-function-declarator initializer_opt
-init-declarator := function-declarator requires-clause_opt
-function-declarator := declarator [guard]
-non-function-declarator := declarator [guard]
-declarator := ptr-declarator
-declarator := noptr-declarator parameters-and-qualifiers trailing-return-type
-ptr-declarator := noptr-declarator
-ptr-declarator := ptr-operator ptr-declarator
-noptr-declarator := declarator-id
-noptr-declarator := noptr-declarator parameters-and-qualifiers
-noptr-declarator := noptr-declarator [ constant-expression_opt ]
-noptr-declarator := ( ptr-declarator )
-parameters-and-qualifiers := ( parameter-declaration-clause_opt [recover=Brackets] ) cv-qualifier-seq_opt ref-qualifier_opt noexcept-specifier_opt
-trailing-return-type := -> type-id
-ptr-operator := * cv-qualifier-seq_opt
-ptr-operator := &
-ptr-operator := &&
-ptr-operator := nested-name-specifier * cv-qualifier-seq_opt
-cv-qualifier-seq := cv-qualifier cv-qualifier-seq_opt
-cv-qualifier := CONST
-cv-qualifier := VOLATILE
-ref-qualifier := &
-ref-qualifier := &&
-declarator-id := ..._opt id-expression
-type-id := type-specifier-seq abstract-declarator_opt
-defining-type-id := defining-type-specifier-seq abstract-declarator_opt
-abstract-declarator := ptr-abstract-declarator
-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers trailing-return-type
-abstract-declarator := abstract-pack-declarator
-ptr-abstract-declarator := noptr-abstract-declarator
-ptr-abstract-declarator := ptr-operator ptr-abstract-declarator_opt
-noptr-abstract-declarator := noptr-abstract-declarator_opt parameters-and-qualifiers
-noptr-abstract-declarator := noptr-abstract-declarator_opt [ constant-expression_opt ]
-noptr-abstract-declarator := ( ptr-abstract-declarator )
-abstract-pack-declarator := noptr-abstract-pack-declarator
-abstract-pack-declarator := ptr-operator abstract-pack-declarator
-noptr-abstract-pack-declarator := noptr-abstract-pack-declarator parameters-and-qualifiers
-noptr-abstract-pack-declarator := noptr-abstract-pack-declarator [ constant-expression_opt ]
-noptr-abstract-pack-declarator := ...
-#! Custom modifications to avoid nullable clause.
-parameter-declaration-clause := parameter-declaration-list
-parameter-declaration-clause := parameter-declaration-list_opt ...
-parameter-declaration-clause := parameter-declaration-list , ...
-parameter-declaration-list := parameter-declaration
-parameter-declaration-list := parameter-declaration-list , parameter-declaration
-parameter-declaration := decl-specifier-seq declarator
-parameter-declaration := decl-specifier-seq declarator = initializer-clause
-parameter-declaration := decl-specifier-seq abstract-declarator_opt
-parameter-declaration := decl-specifier-seq abstract-declarator_opt = initializer-clause
-# dcl.init
-initializer := brace-or-equal-initializer
-initializer := ( expression-list )
-brace-or-equal-initializer := = initializer-clause
-brace-or-equal-initializer := braced-init-list
-initializer-clause := assignment-expression
-initializer-clause := braced-init-list
-#! Allow mixed designated/non-designated init-list.
-# This is standard C, and accepted by clang and others as an extension.
-# FIXME: Decouple recovery from is-there-a-trailing-comma!
-braced-init-list := { initializer-list [recover=Brackets] }
-braced-init-list := { initializer-list , }
-braced-init-list := { }
-initializer-list := initializer-list-item
-initializer-list := initializer-list , initializer-list-item
-initializer-list-item := initializer-clause ..._opt
-initializer-list-item := designator brace-or-equal-initializer ..._opt
-designator := . IDENTIFIER
-#! Array designators are legal in C, and a common extension in C++.
-designator := [ expression ]
-expr-or-braced-init-list := expression
-expr-or-braced-init-list := braced-init-list
-# dcl.fct
-function-definition := decl-specifier-seq_opt function-declarator virt-specifier-seq_opt function-body
-function-definition := decl-specifier-seq_opt function-declarator requires-clause function-body
-function-body := ctor-initializer_opt compound-statement
-function-body := function-try-block
-function-body := = DEFAULT ;
-function-body := = DELETE ;
-# dcl.enum
-enum-specifier := enum-head { enumerator-list_opt }
-enum-specifier := enum-head { enumerator-list , }
-enum-head := enum-key enum-head-name_opt enum-base_opt
-enum-head-name := nested-name-specifier_opt IDENTIFIER
-opaque-enum-declaration := enum-key enum-head-name enum-base_opt ;
-enum-key := ENUM
-enum-key := ENUM CLASS
-enum-key := ENUM STRUCT
-enum-base := : type-specifier-seq
-enumerator-list := enumerator-definition
-enumerator-list := enumerator-list , enumerator-definition
-enumerator-definition := enumerator
-enumerator-definition := enumerator = constant-expression
-enumerator := IDENTIFIER
-using-enum-declaration := USING elaborated-enum-specifier ;
-# basic.namespace
-namespace-definition := named-namespace-definition
-namespace-definition := unnamed-namespace-definition
-namespace-definition := nested-namespace-definition
-named-namespace-definition := INLINE_opt NAMESPACE IDENTIFIER { namespace-body_opt }
-unnamed-namespace-definition := INLINE_opt NAMESPACE { namespace-body_opt }
-nested-namespace-definition := NAMESPACE enclosing-namespace-specifier :: INLINE_opt IDENTIFIER { namespace-body }
-enclosing-namespace-specifier := IDENTIFIER
-enclosing-namespace-specifier := enclosing-namespace-specifier :: INLINE_opt IDENTIFIER
-#! Custom modification to avoid nullable namespace-body.
-namespace-body := declaration-seq
-namespace-alias-definition := NAMESPACE IDENTIFIER = qualified-namespace-specifier ;
-qualified-namespace-specifier := nested-name-specifier_opt namespace-name
-using-directive := USING NAMESPACE nested-name-specifier_opt namespace-name ;
-using-declaration := USING using-declarator-list ;
-using-declarator-list := using-declarator ..._opt
-using-declarator-list := using-declarator-list , using-declarator ..._opt
-using-declarator := TYPENAME_opt nested-name-specifier unqualified-id
-# dcl.asm
-asm-declaration := ASM ( string-literal ) ;
-# dcl.link
-linkage-specification := EXTERN string-literal { declaration-seq_opt }
-linkage-specification := EXTERN string-literal declaration
-
-# gram.module
-module-declaration := export-keyword_opt module-keyword module-name module-partition_opt ;
-module-name := module-name-qualifier_opt IDENTIFIER
-module-partition := : module-name-qualifier_opt IDENTIFIER
-module-name-qualifier := IDENTIFIER .
-module-name-qualifier := module-name-qualifier IDENTIFIER .
-export-declaration := EXPORT declaration
-export-declaration := EXPORT { declaration-seq_opt }
-export-declaration := export-keyword module-import-declaration
-module-import-declaration := import-keyword module-name ;
-module-import-declaration := import-keyword module-partition ;
-# FIXME: we don't have header-name in the grammar. Handle these in PP?
-# module-import-declaration := import-keyword header-name ;
-global-module-fragment := module-keyword ; declaration-seq_opt
-private-module-fragment := module-keyword : PRIVATE ; declaration-seq_opt
-
-# gram.class
-class-specifier := class-head { member-specification_opt [recover=Brackets] }
-class-head := class-key class-head-name class-virt-specifier_opt base-clause_opt
-class-head := class-key base-clause_opt
-class-head-name := nested-name-specifier_opt type-name
-class-virt-specifier := contextual-final
-class-key := CLASS
-class-key := STRUCT
-class-key := UNION
-member-specification := member-declaration member-specification_opt
-member-specification := access-specifier : member-specification_opt
-member-declaration := decl-specifier-seq member-declarator-list_opt ;
-member-declaration := member-declarator-list ;
-member-declaration := function-definition
-member-declaration := using-declaration
-member-declaration := using-enum-declaration
-member-declaration := static_assert-declaration
-member-declaration := template-declaration
-member-declaration := explicit-specialization
-member-declaration := deduction-guide
-member-declaration := alias-declaration
-member-declaration := opaque-enum-declaration
-member-declaration := empty-declaration
-member-declarator-list := member-declarator
-member-declarator-list := member-declarator-list , member-declarator
-member-declarator := function-declarator virt-specifier-seq_opt pure-specifier_opt
-member-declarator := function-declarator requires-clause
-member-declarator := non-function-declarator brace-or-equal-initializer_opt
-member-declarator := IDENTIFIER_opt : constant-expression brace-or-equal-initializer_opt
-virt-specifier-seq := virt-specifier
-virt-specifier-seq := virt-specifier-seq virt-specifier
-virt-specifier := contextual-override
-virt-specifier := contextual-final
-pure-specifier := = contextual-zero
-conversion-function-id := OPERATOR conversion-type-id
-conversion-type-id := type-specifier-seq conversion-declarator_opt
-conversion-declarator := ptr-operator conversion-declarator_opt
-base-clause := : base-specifier-list
-base-specifier-list := base-specifier ..._opt
-base-specifier-list := base-specifier-list , base-specifier ..._opt
-base-specifier := class-or-decltype
-base-specifier := VIRTUAL access-specifier_opt class-or-decltype
-base-specifier := access-specifier VIRTUAL_opt class-or-decltype
-class-or-decltype := nested-name-specifier_opt type-name
-class-or-decltype := nested-name-specifier TEMPLATE simple-template-id
-class-or-decltype := decltype-specifier
-access-specifier := PRIVATE
-access-specifier := PROTECTED
-access-specifier := PUBLIC
-ctor-initializer := : mem-initializer-list
-mem-initializer-list := mem-initializer ..._opt
-mem-initializer-list := mem-initializer-list , mem-initializer ..._opt
-mem-initializer := mem-initializer-id ( expression-list_opt )
-mem-initializer := mem-initializer-id braced-init-list
-mem-initializer-id := class-or-decltype
-mem-initializer-id := IDENTIFIER
-
-# gram.over
-operator-function-id := OPERATOR operator-name
-operator-name := NEW
-operator-name := DELETE
-operator-name := NEW [ ]
-operator-name := DELETE [ ]
-operator-name := CO_AWAIT
-operator-name := ( )
-operator-name := [ ]
-operator-name := ->
-operator-name := ->*
-operator-name := ~
-operator-name := !
-operator-name := +
-operator-name := -
-operator-name := *
-operator-name := /
-operator-name := %
-operator-name := ^
-operator-name := &
-operator-name := |
-operator-name := =
-operator-name := +=
-operator-name := -=
-operator-name := *=
-operator-name := /=
-operator-name := %=
-operator-name := ^=
-operator-name := &=
-operator-name := |=
-operator-name := ==
-operator-name := !=
-operator-name := <
-operator-name := >
-operator-name := <=
-operator-name := >=
-operator-name := <=>
-operator-name := ^^
-operator-name := ||
-operator-name := <<
-operator-name := greatergreater
-operator-name := <<=
-operator-name := >>=
-operator-name := ++
-operator-name := --
-operator-name := ,
-literal-operator-id := OPERATOR string-literal IDENTIFIER
-literal-operator-id := OPERATOR user-defined-string-literal
-
-# gram.temp
-template-declaration := template-head declaration
-template-declaration := template-head concept-definition
-template-head := TEMPLATE < template-parameter-list > requires-clause_opt
-template-parameter-list := template-parameter
-template-parameter-list := template-parameter-list , template-parameter
-requires-clause := REQUIRES constraint-logical-or-expression
-constraint-logical-or-expression := constraint-logical-and-expression
-constraint-logical-or-expression := constraint-logical-or-expression || constraint-logical-and-expression
-constraint-logical-and-expression := primary-expression
-constraint-logical-and-expression := constraint-logical-and-expression && primary-expression
-template-parameter := type-parameter
-template-parameter := parameter-declaration
-type-parameter := type-parameter-key ..._opt IDENTIFIER_opt
-type-parameter := type-parameter-key IDENTIFIER_opt = type-id
-type-parameter := type-constraint ..._opt IDENTIFIER_opt
-type-parameter := type-constraint IDENTIFIER_opt = type-id
-type-parameter := template-head type-parameter-key ..._opt IDENTIFIER_opt
-type-parameter := template-head type-parameter-key IDENTIFIER_opt = id-expression
-type-parameter-key := CLASS
-type-parameter-key := TYPENAME
-type-constraint := nested-name-specifier_opt concept-name
-type-constraint := nested-name-specifier_opt concept-name < template-argument-list_opt >
-simple-template-id := template-name < template-argument-list_opt >
-template-id := simple-template-id
-template-id := operator-function-id < template-argument-list_opt >
-template-id := literal-operator-id < template-argument-list_opt >
-template-argument-list := template-argument ..._opt
-template-argument-list := template-argument-list , template-argument ..._opt
-template-argument := constant-expression
-template-argument := type-id
-template-argument := id-expression
-constraint-expression := logical-or-expression
-deduction-guide := explicit-specifier_opt template-name ( parameter-declaration-list_opt ) -> simple-template-id ;
-concept-definition := CONCEPT concept-name = constraint-expression ;
-concept-name := IDENTIFIER
-typename-specifier := TYPENAME nested-name-specifier IDENTIFIER
-typename-specifier := TYPENAME nested-name-specifier TEMPLATE_opt simple-template-id
-explicit-instantiation := EXTERN_opt TEMPLATE declaration
-explicit-specialization := TEMPLATE < > declaration
-
-# gram.except
-try-block := TRY compound-statement handler-seq
-function-try-block := TRY ctor-initializer_opt compound-statement handler-seq
-handler-seq := handler handler-seq_opt
-handler := CATCH ( exception-declaration ) compound-statement
-exception-declaration := type-specifier-seq declarator
-exception-declaration := type-specifier-seq abstract-declarator_opt
-noexcept-specifier := NOEXCEPT ( constant-expression )
-noexcept-specifier := NOEXCEPT
-
-# gram.cpp
-identifier-list := IDENTIFIER
-identifier-list := identifier-list , IDENTIFIER
-
-# gram.lex
-#! As we use clang lexer, most of lexical symbols are not needed, we only add
-#! needed literals.
-literal := integer-literal
-literal := character-literal
-literal := floating-point-literal
-literal := string-literal
-literal := boolean-literal
-literal := pointer-literal
-literal := user-defined-literal
-integer-literal := NUMERIC_CONSTANT [guard]
-character-literal := CHAR_CONSTANT [guard]
-character-literal := WIDE_CHAR_CONSTANT [guard]
-character-literal := UTF8_CHAR_CONSTANT [guard]
-character-literal := UTF16_CHAR_CONSTANT [guard]
-character-literal := UTF32_CHAR_CONSTANT [guard]
-floating-point-literal := NUMERIC_CONSTANT [guard]
-string-literal-chunk := STRING_LITERAL [guard]
-string-literal-chunk := WIDE_STRING_LITERAL [guard]
-string-literal-chunk := UTF8_STRING_LITERAL [guard]
-string-literal-chunk := UTF16_STRING_LITERAL [guard]
-string-literal-chunk := UTF32_STRING_LITERAL [guard]
-#! Technically, string concatenation happens at phase 6 which is before parsing,
-#! so it doesn't belong to the grammar. However, we extend the grammar to
-#! support it, to make the pseudoparser fully functional on practical code.
-string-literal := string-literal-chunk
-string-literal := string-literal string-literal-chunk
-user-defined-literal := user-defined-integer-literal
-user-defined-literal := user-defined-floating-point-literal
-user-defined-literal := user-defined-string-literal
-user-defined-literal := user-defined-character-literal
-user-defined-integer-literal := NUMERIC_CONSTANT [guard]
-user-defined-string-literal-chunk := STRING_LITERAL [guard]
-user-defined-string-literal-chunk := WIDE_STRING_LITERAL [guard]
-user-defined-string-literal-chunk := UTF8_STRING_LITERAL [guard]
-user-defined-string-literal-chunk := UTF16_STRING_LITERAL [guard]
-user-defined-string-literal-chunk := UTF32_STRING_LITERAL [guard]
-user-defined-string-literal := user-defined-string-literal-chunk
-user-defined-string-literal := string-literal-chunk user-defined-string-literal
-user-defined-string-literal := user-defined-string-literal string-literal-chunk
-user-defined-floating-point-literal := NUMERIC_CONSTANT [guard]
-user-defined-character-literal := CHAR_CONSTANT [guard]
-user-defined-character-literal := WIDE_CHAR_CONSTANT [guard]
-user-defined-character-literal := UTF8_CHAR_CONSTANT [guard]
-user-defined-character-literal := UTF16_CHAR_CONSTANT [guard]
-user-defined-character-literal := UTF32_CHAR_CONSTANT [guard]
-boolean-literal := FALSE
-boolean-literal := TRUE
-pointer-literal := NULLPTR
-
-#! Contextual keywords -- clang lexer always lexes them as identifier tokens.
-#! Placeholders for literal text in the grammar that lex as other things.
-contextual-override := IDENTIFIER [guard]
-contextual-final := IDENTIFIER [guard]
-contextual-zero := NUMERIC_CONSTANT [guard]
-module-keyword := IDENTIFIER [guard]
-import-keyword := IDENTIFIER [guard]
-export-keyword := IDENTIFIER [guard]
-
-#! greatergreater token -- clang lexer always lexes it as a single token, we
-#! split it into two tokens to make the GLR parser aware of the nested-template
-#! case.
-greatergreater := > >
-
-#! C++ predefined identifier, __func__ [dcl.fct.def.general] p8
-#! FIXME: add other (MSVC, GNU extension) predefined identifiers.
-primary-expression := predefined-expression
-predefined-expression := __FUNC__
diff --git a/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt b/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
deleted file mode 100644
index bb08ebab0fa62..0000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-set(LLVM_LINK_COMPONENTS Support)
-
-add_clang_library(clangPseudoGrammar
- Grammar.cpp
- GrammarBNF.cpp
- LRGraph.cpp
- LRTable.cpp
- LRTableBuild.cpp
- )
-
diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
deleted file mode 100644
index 3e9c5c3c7a6c4..0000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-//===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-#include <optional>
-
-namespace clang {
-namespace pseudo {
-
-Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
- : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
- assert(Sequence.size() <= Rule::MaxElements);
- llvm::copy(Sequence, this->Sequence);
-}
-
-Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
- Underscore = *findNonterminal("_");
-}
-
-llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
- assert(isNonterminal(SID));
- const auto &R = T->Nonterminals[SID].RuleRange;
- assert(R.End <= T->Rules.size());
- return llvm::ArrayRef(&T->Rules[R.Start], R.End - R.Start);
-}
-
-const Rule &Grammar::lookupRule(RuleID RID) const {
- assert(RID < T->Rules.size());
- return T->Rules[RID];
-}
-
-llvm::StringRef Grammar::symbolName(SymbolID SID) const {
- if (isToken(SID))
- return T->Terminals[symbolToToken(SID)];
- return T->Nonterminals[SID].Name;
-}
-
-std::optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
- auto It = llvm::partition_point(
- T->Nonterminals,
- [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
- if (It != T->Nonterminals.end() && It->Name == Name)
- return It - T->Nonterminals.begin();
- return std::nullopt;
-}
-
-std::string Grammar::dumpRule(RuleID RID) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- const Rule &R = T->Rules[RID];
- OS << symbolName(R.Target) << " :=";
- for (unsigned I = 0; I < R.Size; ++I) {
- OS << " " << symbolName(R.Sequence[I]);
- if (R.RecoveryIndex == I)
- OS << " [recover=" << T->AttributeValues[R.Recovery] << "]";
- }
- if (R.Guarded)
- OS << " [guard]";
- return Result;
-}
-
-std::string Grammar::dumpRules(SymbolID SID) const {
- assert(isNonterminal(SID));
- std::string Result;
- const auto &Range = T->Nonterminals[SID].RuleRange;
- for (RuleID RID = Range.Start; RID < Range.End; ++RID)
- Result.append(dumpRule(RID)).push_back('\n');
- return Result;
-}
-
-std::string Grammar::dump() const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "Nonterminals:\n";
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
- OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID));
- OS << "Rules:\n";
- for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
- OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID));
- return OS.str();
-}
-
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
- std::vector<llvm::DenseSet<SymbolID>> FirstSets(
- G.table().Nonterminals.size());
- auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
- assert(isNonterminal(Target));
- if (isToken(First))
- return FirstSets[Target].insert(First).second;
- bool Changed = false;
- for (SymbolID SID : FirstSets[First])
- Changed |= FirstSets[Target].insert(SID).second;
- return Changed;
- };
-
- // A rule S := T ... implies elements in FIRST(S):
- // - if T is a terminal, FIRST(S) contains T
- // - if T is a nonterminal, FIRST(S) contains FIRST(T)
- // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
- // end up being incomplete.
- // We iterate until we hit a fixed point.
- // (This isn't particularly efficient, but table building isn't on the
- // critical path).
- bool Changed = true;
- while (Changed) {
- Changed = false;
- for (const auto &R : G.table().Rules)
- // We only need to consider the first element because symbols are
- // non-nullable.
- Changed |= ExpandFirstSet(R.Target, R.seq().front());
- }
- return FirstSets;
-}
-
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
- auto FirstSets = firstSets(G);
- std::vector<llvm::DenseSet<SymbolID>> FollowSets(
- G.table().Nonterminals.size());
- // Expand the follow set of a nonterminal symbol Y by adding all from the
- // given symbol set.
- auto ExpandFollowSet = [&FollowSets](SymbolID Y,
- const llvm::DenseSet<SymbolID> &ToAdd) {
- assert(isNonterminal(Y));
- bool Changed = false;
- for (SymbolID F : ToAdd)
- Changed |= FollowSets[Y].insert(F).second;
- return Changed;
- };
- // Follow sets is computed based on the following 3 rules, the computation
- // is completed at a fixed point where there is no more new symbols can be
- // added to any of the follow sets.
- //
- // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
- // augmented grammar, in our case it is '_'.
- FollowSets[G.underscore()].insert(tokenSymbol(tok::eof));
- bool Changed = true;
- while (Changed) {
- Changed = false;
- for (const auto &R : G.table().Rules) {
- // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
- // FOLLOW(Y).
- for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
- if (isToken(R.seq()[I]))
- continue;
- // We only need to consider the next symbol because symbols are
- // non-nullable.
- SymbolID Next = R.seq()[I + 1];
- if (isToken(Next))
- // First set for a terminal is itself.
- Changed |= ExpandFollowSet(R.seq()[I], {Next});
- else
- Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
- }
- // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
- // FOLLOW(Z).
- SymbolID Z = R.seq().back();
- if (isNonterminal(Z))
- Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
- }
- }
- return FollowSets;
-}
-
-static llvm::ArrayRef<std::string> getTerminalNames() {
- static const auto &TerminalNames = []() {
- auto TerminalNames = new std::string[NumTerminals];
-#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling;
-#define KEYWORD(Keyword, Condition) \
- TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper();
-#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper();
-#include "clang/Basic/TokenKinds.def"
- return llvm::ArrayRef(TerminalNames, NumTerminals);
- }();
- return TerminalNames;
-}
-GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
deleted file mode 100644
index f1b8e06e22432..0000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ /dev/null
@@ -1,362 +0,0 @@
-//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <memory>
-#include <utility>
-
-namespace clang {
-namespace pseudo {
-
-namespace {
-static const llvm::StringRef OptSuffix = "_opt";
-static const llvm::StringRef StartSymbol = "_";
-
-// Builds grammar from BNF files.
-class GrammarBuilder {
-public:
- GrammarBuilder(std::vector<std::string> &Diagnostics)
- : Diagnostics(Diagnostics) {}
-
- Grammar build(llvm::StringRef BNF) {
- auto Specs = eliminateOptional(parse(BNF));
-
- assert(llvm::all_of(Specs,
- [](const RuleSpec &R) {
- if (R.Target.ends_with(OptSuffix))
- return false;
- return llvm::all_of(
- R.Sequence, [](const RuleSpec::Element &E) {
- return !E.Symbol.ends_with(OptSuffix);
- });
- }) &&
- "Optional symbols should be eliminated!");
-
- auto T = std::make_unique<GrammarTable>();
-
- // Assemble the name->ID and ID->nonterminal name maps.
- llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
- llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
-
- llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
-
- for (uint16_t I = 0; I < NumTerminals; ++I)
- SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
- auto Consider = [&](llvm::StringRef Name) {
- if (!SymbolIds.count(Name))
- UniqueNonterminals.insert(Name);
- };
- for (const auto &Spec : Specs) {
- Consider(Spec.Target);
- for (const RuleSpec::Element &Elt : Spec.Sequence) {
- Consider(Elt.Symbol);
- for (const auto& KV : Elt.Attributes)
- UniqueAttributeValues.insert(KV.second);
- }
- }
- for (llvm::StringRef Name : UniqueNonterminals) {
- T->Nonterminals.emplace_back();
- T->Nonterminals.back().Name = Name.str();
- }
- assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
- "Too many nonterminals to fit in SymbolID bits!");
- llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
- const GrammarTable::Nonterminal &R) {
- return L.Name < R.Name;
- });
- // Add an empty string for the corresponding sentinel unset attribute.
- T->AttributeValues.push_back("");
- UniqueAttributeValues.erase("");
- for (llvm::StringRef Name : UniqueAttributeValues) {
- T->AttributeValues.emplace_back();
- T->AttributeValues.back() = Name.str();
- }
- llvm::sort(T->AttributeValues);
- assert(T->AttributeValues.front() == "");
-
- // Build name -> ID maps for nonterminals.
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
- SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
-
- // Convert the rules.
- T->Rules.reserve(Specs.size());
- std::vector<SymbolID> Symbols;
- auto Lookup = [SymbolIds](llvm::StringRef Name) {
- auto It = SymbolIds.find(Name);
- assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
- return It->second;
- };
- for (const auto &Spec : Specs) {
- assert(Spec.Sequence.size() <= Rule::MaxElements);
- Symbols.clear();
- for (const RuleSpec::Element &Elt : Spec.Sequence)
- Symbols.push_back(Lookup(Elt.Symbol));
- T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
- applyAttributes(Spec, *T, T->Rules.back());
- }
-
- assert(T->Rules.size() < (1 << RuleBits) &&
- "Too many rules to fit in RuleID bits!");
- const auto &SymbolOrder = getTopologicalOrder(T.get());
- llvm::stable_sort(
- T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
- // Sorted by the topological order of the nonterminal Target.
- return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
- });
- for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
- auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
- return SymbolOrder[R.Target] < SymbolOrder[SID];
- });
- RuleID Start = StartIt - T->Rules.begin();
- RuleID End = Start;
- while (End < T->Rules.size() && T->Rules[End].Target == SID)
- ++End;
- T->Nonterminals[SID].RuleRange = {Start, End};
- }
- Grammar G(std::move(T));
- diagnoseGrammar(G);
- return G;
- }
-
- // Gets topological order for nonterminal symbols.
- //
- // The topological order is defined as: if a *single* nonterminal A produces
- // (or transitively) a nonterminal B (that said, there is a production rule
- // B := A), then A is less than B.
- //
- // It returns the sort key for each symbol, the array is indexed by SymbolID.
- std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
- std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
- for (const auto &Rule : T->Rules) {
- // if A := B, A depends on B.
- if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
- Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
- }
- llvm::sort(Dependencies);
- std::vector<SymbolID> Order;
- // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
- enum State {
- NotVisited,
- Visiting,
- Visited,
- };
- std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
- std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
- if (VisitStates[SID] == Visited)
- return;
- if (VisitStates[SID] == Visiting) {
- Diagnostics.push_back(
- llvm::formatv("The grammar contains a cycle involving symbol {0}",
- T->Nonterminals[SID].Name));
- return;
- }
- VisitStates[SID] = Visiting;
- for (auto It = llvm::lower_bound(Dependencies,
- std::pair<SymbolID, SymbolID>{SID, 0});
- It != Dependencies.end() && It->first == SID; ++It)
- DFS(It->second);
- VisitStates[SID] = Visited;
- Order.push_back(SID);
- };
- for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
- DFS(ID);
- std::vector<unsigned> Result(T->Nonterminals.size(), 0);
- for (size_t I = 0; I < Order.size(); ++I)
- Result[Order[I]] = I;
- return Result;
- }
-
-private:
- // Text representation of a BNF grammar rule.
- struct RuleSpec {
- llvm::StringRef Target;
- struct Element {
- llvm::StringRef Symbol; // Name of the symbol
- // Attributes that are associated to the sequence symbol or rule.
- std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
- Attributes;
- };
- std::vector<Element> Sequence;
-
- std::string toString() const {
- std::vector<llvm::StringRef> Body;
- for (const auto &E : Sequence)
- Body.push_back(E.Symbol);
- return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
- }
- };
-
- std::vector<RuleSpec> parse(llvm::StringRef Lines) {
- std::vector<RuleSpec> Specs;
- for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
- Line = Line.trim();
- // Strip anything coming after the '#' (comment).
- Line = Line.take_while([](char C) { return C != '#'; });
- if (Line.empty())
- continue;
- RuleSpec Rule;
- if (parseLine(Line, Rule))
- Specs.push_back(std::move(Rule));
- }
- return Specs;
- }
-
- bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
- auto Parts = Line.split(":=");
- if (Parts.first == Line) { // no separator in Line
- Diagnostics.push_back(
- llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
- return false;
- }
-
- Out.Target = Parts.first.trim();
- Out.Sequence.clear();
- for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
- Chunk = Chunk.trim();
- if (Chunk.empty())
- continue; // skip empty
- if (Chunk.starts_with("[") && Chunk.ends_with("]")) {
- if (Out.Sequence.empty())
- continue;
-
- parseAttributes(Chunk, Out.Sequence.back().Attributes);
- continue;
- }
-
- Out.Sequence.push_back({Chunk, /*Attributes=*/{}});
- }
- return true;
- }
-
- bool parseAttributes(
- llvm::StringRef Content,
- std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
- assert(Content.starts_with("[") && Content.ends_with("]"));
- auto KV = Content.drop_front().drop_back().split('=');
- Out.push_back({KV.first, KV.second.trim()});
-
- return true;
- }
- // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
- void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
- auto LookupExtensionID = [&T](llvm::StringRef Name) {
- const auto It = llvm::partition_point(
- T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
- assert(It != T.AttributeValues.end() && *It == Name &&
- "Didn't find the attribute in AttrValues!");
- return It - T.AttributeValues.begin();
- };
- for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
- for (const auto &KV : Spec.Sequence[I].Attributes) {
- if (KV.first == "guard") {
- R.Guarded = true;
- } else if (KV.first == "recover") {
- R.Recovery = LookupExtensionID(KV.second);
- R.RecoveryIndex = I;
- } else {
- Diagnostics.push_back(
- llvm::formatv("Unknown attribute '{0}'", KV.first).str());
- }
- }
- }
- }
-
- // Inlines all _opt symbols.
- // For example, a rule E := id +_opt id, after elimination, we have two
- // equivalent rules:
- // 1) E := id + id
- // 2) E := id id
- std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
- std::vector<RuleSpec> Results;
- std::vector<RuleSpec::Element> Storage;
- for (const auto &R : Input) {
- eliminateOptionalTail(
- R.Sequence, Storage, [&Results, &Storage, &R, this]() {
- if (Storage.empty()) {
- Diagnostics.push_back(
- llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
- return;
- }
- Results.push_back({R.Target, Storage});
- });
- assert(Storage.empty());
- }
- return Results;
- }
- void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
- std::vector<RuleSpec::Element> &Result,
- llvm::function_ref<void()> CB) {
- if (Elements.empty())
- return CB();
- auto Front = Elements.front();
- if (!Front.Symbol.ends_with(OptSuffix)) {
- Result.push_back(std::move(Front));
- eliminateOptionalTail(Elements.drop_front(1), Result, CB);
- Result.pop_back();
- return;
- }
- // Enumerate two options: skip the opt symbol, or inline the symbol.
- eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
- Front.Symbol = Front.Symbol.drop_back(OptSuffix.size()); // drop "_opt"
- Result.push_back(std::move(Front));
- eliminateOptionalTail(Elements.drop_front(1), Result, CB);
- Result.pop_back();
- }
-
- // Diagnoses the grammar and emit warnings if any.
- void diagnoseGrammar(const Grammar &G) {
- const auto &T = G.table();
- for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
- auto Range = T.Nonterminals[SID].RuleRange;
- if (Range.Start == Range.End)
- Diagnostics.push_back(
- llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
- llvm::StringRef NameRef = T.Nonterminals[SID].Name;
- if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
- Diagnostics.push_back(llvm::formatv(
- "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
- }
- }
- llvm::DenseSet<llvm::hash_code> VisitedRules;
- for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
- const auto &R = T.Rules[RID];
- auto Code = llvm::hash_combine(
- R.Target, llvm::hash_combine_range(R.seq().begin(), R.seq().end()));
- auto [_, New] = VisitedRules.insert(Code);
- if (!New)
- Diagnostics.push_back(
- llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
- }
- // symbol-id -> used counts
- std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
- for (const Rule &R : T.Rules)
- for (SymbolID SID : R.seq())
- if (isNonterminal(SID))
- ++UseCounts[SID];
- for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
- if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
- Diagnostics.push_back(
- llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
- }
- std::vector<std::string> &Diagnostics;
-};
-} // namespace
-
-Grammar Grammar::parseBNF(llvm::StringRef BNF,
- std::vector<std::string> &Diagnostics) {
- Diagnostics.clear();
- return GrammarBuilder(Diagnostics).build(BNF);
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
deleted file mode 100644
index 82c7cc7d8b293..0000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/LRGraph.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-using ItemSet = std::vector<clang::pseudo::Item>;
-
-namespace llvm {
-// Support clang::pseudo::Item as DenseMap keys.
-template <> struct DenseMapInfo<ItemSet> {
- static inline ItemSet getEmptyKey() {
- return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()};
- }
- static inline ItemSet getTombstoneKey() {
- return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()};
- }
- static unsigned getHashValue(const ItemSet &I) {
- return llvm::hash_combine_range(I.begin(), I.end());
- }
- static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
- return LHS == RHS;
- }
-};
-} // namespace llvm
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-struct SortByNextSymbol {
- SortByNextSymbol(const Grammar &G) : G(G) {}
- bool operator()(const Item &L, const Item &R) {
- if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
- return L.next(G) < R.next(G);
- if (L.hasNext() != R.hasNext())
- return L.hasNext() < R.hasNext(); // a trailing dot is minimal.
- return L < R;
- }
- const Grammar &G;
-};
-
-// Computes a closure of the given item set S:
-// - extends the given S to contain all options for parsing next token;
-// - nonterminals after a dot are recursively expanded into the begin-state
-// of all production rules that produce that nonterminal;
-//
-// Given
-// Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
-// Input = [ E := . T ]
-// returns [ E := . T, T := . n, T := . ( E ) ]
-State closure(ItemSet Queue, const Grammar &G) {
- llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
- // We reuse the passed-by-value Queue as the final result, as it's already
- // initialized to the right elements.
- size_t ItIndex = 0;
- while (ItIndex < Queue.size()) {
- const Item &ExpandingItem = Queue[ItIndex];
- ++ItIndex;
- if (!ExpandingItem.hasNext())
- continue;
-
- SymbolID NextSym = ExpandingItem.next(G);
- if (pseudo::isToken(NextSym))
- continue;
- auto RRange = G.table().Nonterminals[NextSym].RuleRange;
- for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
- Item NewItem = Item::start(RID, G);
- if (InQueue.insert(NewItem).second) // new
- Queue.push_back(std::move(NewItem));
- }
- }
- Queue.shrink_to_fit();
- llvm::sort(Queue, SortByNextSymbol(G));
- return {std::move(Queue)};
-}
-
-// Returns all next (with a dot advanced) kernel item sets, partitioned by the
-// advanced symbol.
-//
-// Given
-// S = [ E := . a b, E := E . - T ]
-// returns [
-// {id(a), [ E := a . b ]},
-// {id(-), [ E := E - . T ]}
-// ]
-std::vector<std::pair<SymbolID, ItemSet>>
-nextAvailableKernelItems(const State &S, const Grammar &G) {
- std::vector<std::pair<SymbolID, ItemSet>> Results;
- llvm::ArrayRef<Item> AllItems = S.Items;
- AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
- while (!AllItems.empty()) {
- SymbolID AdvancedSymbol = AllItems.front().next(G);
- auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
- assert(I.hasNext());
- return I.next(G) == AdvancedSymbol;
- });
- assert(!Batch.empty());
- AllItems = AllItems.drop_front(Batch.size());
-
- // Advance a dot over the Symbol.
- ItemSet Next;
- for (const Item &I : Batch)
- Next.push_back(I.advance());
- // sort the set to keep order determinism for hash computation.
- llvm::sort(Next);
- Results.push_back({AdvancedSymbol, std::move(Next)});
- }
- return Results;
-}
-
-std::vector<std::pair<ExtensionID, SymbolID>>
-availableRecovery(const State &S, const Grammar &G) {
- std::vector<std::pair<ExtensionID, SymbolID>> Result;
- for (const Item &I : S.Items) {
- const auto &Rule = G.lookupRule(I.rule());
- if (I.dot() != Rule.RecoveryIndex)
- continue;
- Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
- }
- llvm::sort(Result);
- Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
- return Result;
-}
-
-} // namespace
-
-std::string Item::dump(const Grammar &G) const {
- const auto &Rule = G.lookupRule(RID);
- auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
- std::vector<llvm::StringRef> Results;
- for (auto SID : Syms)
- Results.push_back(G.symbolName(SID));
- return Results;
- };
- return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
- llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
- llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
- Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
- .str();
-}
-
-std::string State::dump(const Grammar &G, unsigned Indent) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- for (const auto &Item : Items)
- OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
- return OS.str();
-}
-
-std::string LRGraph::dumpForTests(const Grammar &G) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "States:\n";
- for (StateID ID = 0; ID < States.size(); ++ID) {
- OS << llvm::formatv("State {0}\n", ID);
- OS << States[ID].dump(G, /*Indent*/ 4);
- }
- for (const auto &E : Edges) {
- OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
- E.Dst);
- }
- return OS.str();
-}
-
-LRGraph LRGraph::buildLR0(const Grammar &G) {
- class Builder {
- public:
- Builder(const Grammar &G) : G(G) {}
-
- // Adds a given state if not existed.
- std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
- assert(llvm::is_sorted(KernelItems) &&
- "Item must be sorted before inserting to a hash map!");
- auto It = StatesIndex.find(KernelItems);
- if (It != StatesIndex.end())
- return {It->second, false};
- States.push_back(closure(KernelItems, G));
- StateID NextStateID = States.size() - 1;
- StatesIndex.insert({std::move(KernelItems), NextStateID});
- return {NextStateID, true};
- }
-
- void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
- Edges.push_back({Src, Dst, Label});
- }
-
- void insertRecovery(StateID Src, ExtensionID Strategy, SymbolID Result) {
- Recoveries.push_back({Src, Strategy, Result});
- }
-
- // Returns a state with the given id.
- const State &find(StateID ID) const {
- assert(ID < States.size());
- return States[ID];
- }
-
- void addStartState(SymbolID Sym, StateID State) {
- StartStates.push_back({Sym, State});
- }
-
- LRGraph build() && {
- States.shrink_to_fit();
- Edges.shrink_to_fit();
- Recoveries.shrink_to_fit();
- llvm::sort(StartStates);
- StartStates.shrink_to_fit();
- return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
- std::move(StartStates));
- }
-
- private:
- // Key is the **kernel** item sets.
- llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
- std::vector<State> States;
- std::vector<Edge> Edges;
- std::vector<Recovery> Recoveries;
- const Grammar &G;
- std::vector<std::pair<SymbolID, StateID>> StartStates;
- } Builder(G);
-
- std::vector<StateID> PendingStates;
- // Initialize states with the start symbol.
- auto RRange = G.table().Nonterminals[G.underscore()].RuleRange;
- for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
- auto StartState = std::vector<Item>{Item::start(RID, G)};
- auto Result = Builder.insert(std::move(StartState));
- assert(Result.second && "State must be new");
- PendingStates.push_back(Result.first);
-
- const Rule &StartRule = G.lookupRule(RID);
- assert(StartRule.Size == 2 &&
- StartRule.seq().back() == tokenSymbol(tok::eof) &&
- "Start rule must be of the form `_ := start-symbol EOF`!");
- Builder.addStartState(StartRule.seq().front(), Result.first);
- }
-
- while (!PendingStates.empty()) {
- auto StateID = PendingStates.back();
- PendingStates.pop_back();
- for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
- auto Insert = Builder.insert(Next.second);
- if (Insert.second) // new state, insert to the pending queue.
- PendingStates.push_back(Insert.first);
- Builder.insertEdge(StateID, Insert.first, Next.first);
- }
- for (auto Recovery : availableRecovery(Builder.find(StateID), G))
- Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
- }
- return std::move(Builder).build();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
deleted file mode 100644
index 6a68f1489d57a..0000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace clang {
-namespace pseudo {
-
-std::string LRTable::dumpStatistics() const {
- return llvm::formatv(R"(
-Statistics of the LR parsing table:
- number of states: {0}
- number of actions: shift={1} goto={2} reduce={3}
- size of the table (bytes): {4}
-)",
- numStates(), Shifts.size(), Gotos.size(), Reduces.size(),
- bytes())
- .str();
-}
-
-std::string LRTable::dumpForTests(const Grammar &G) const {
- std::string Result;
- llvm::raw_string_ostream OS(Result);
- OS << "LRTable:\n";
- for (StateID S = 0; S < numStates(); ++S) {
- OS << llvm::formatv("State {0}\n", S);
- for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) {
- SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
- if (auto SS = getShiftState(S, TokID))
- OS.indent(4) << llvm::formatv("{0}: shift state {1}\n",
- G.symbolName(TokID), SS);
- }
- for (RuleID R : getReduceRules(S)) {
- SymbolID Target = G.lookupRule(R).Target;
- std::vector<llvm::StringRef> Terminals;
- for (unsigned Terminal = 0; Terminal < NumTerminals; ++Terminal) {
- SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
- if (canFollow(Target, TokID))
- Terminals.push_back(G.symbolName(TokID));
- }
- OS.indent(4) << llvm::formatv("{0}: reduce by rule {1} '{2}'\n",
- llvm::join(Terminals, " "), R,
- G.dumpRule(R));
- }
- for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
- ++NontermID) {
- if (auto GS = getGoToState(S, NontermID)) {
- OS.indent(4) << llvm::formatv("{0}: go to state {1}\n",
- G.symbolName(NontermID), *GS);
- }
- }
- }
- return OS.str();
-}
-
-LRTable::StateID LRTable::getStartState(SymbolID Target) const {
- assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!");
- auto It = llvm::partition_point(
- StartStates, [Target](const std::pair<SymbolID, StateID> &X) {
- return X.first < Target;
- });
- assert(It != StartStates.end() && It->first == Target &&
- "target symbol doesn't have a start state!");
- return It->second;
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
deleted file mode 100644
index 387e1c54ee99b..0000000000000
--- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRGraph.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include <cstdint>
-
-namespace clang {
-namespace pseudo {
-
-LRTable LRTable::Builder::build() && {
- assert(NumNonterminals != 0 && "Set NumNonterminals or init with grammar");
- LRTable Table;
-
- // Count number of states: every state has to be reachable somehow.
- StateID MaxState = 0;
- for (const auto &Entry : StartStates)
- MaxState = std::max(MaxState, Entry.second);
- for (const auto &Entry : Transition)
- MaxState = std::max(MaxState, Entry.second);
- unsigned NumStates = MaxState + 1;
-
- Table.StartStates = std::move(StartStates);
-
- // Compile the goto and shift actions into transition tables.
- llvm::DenseMap<unsigned, SymbolID> Gotos;
- llvm::DenseMap<unsigned, SymbolID> Shifts;
- for (const auto &E : Transition) {
- if (isToken(E.first.second))
- Shifts.try_emplace(shiftIndex(E.first.first, E.first.second, NumStates),
- E.second);
- else
- Gotos.try_emplace(gotoIndex(E.first.first, E.first.second, NumStates),
- E.second);
- }
- Table.Shifts = TransitionTable(Shifts, NumStates * NumTerminals);
- Table.Gotos = TransitionTable(Gotos, NumStates * NumNonterminals);
-
- // Compile the follow sets into a bitmap.
- Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size());
- for (SymbolID NT = 0; NT < FollowSets.size(); ++NT)
- for (SymbolID Follow : FollowSets[NT])
- Table.FollowSets.set(NT * tok::NUM_TOKENS + symbolToToken(Follow));
-
- // Store the reduce actions in a vector partitioned by state.
- Table.ReduceOffset.reserve(NumStates + 1);
- std::vector<RuleID> StateRules;
- for (StateID S = 0; S < NumStates; ++S) {
- Table.ReduceOffset.push_back(Table.Reduces.size());
- auto It = Reduce.find(S);
- if (It == Reduce.end())
- continue;
- Table.Reduces.insert(Table.Reduces.end(), It->second.begin(),
- It->second.end());
- llvm::sort(Table.Reduces.begin() + Table.ReduceOffset.back(),
- Table.Reduces.end());
- }
- Table.ReduceOffset.push_back(Table.Reduces.size());
-
- // Error recovery entries: sort (no dups already), and build offset lookup.
- llvm::sort(Recoveries, [&](const auto &L, const auto &R) {
- return std::tie(L.first, L.second.Result, L.second.Strategy) <
- std::tie(R.first, R.second.Result, R.second.Strategy);
- });
- Table.Recoveries.reserve(Recoveries.size());
- for (const auto &R : Recoveries)
- Table.Recoveries.push_back({R.second.Strategy, R.second.Result});
- Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
- unsigned SortedIndex = 0;
- for (StateID State = 0; State < NumStates; ++State) {
- Table.RecoveryOffset[State] = SortedIndex;
- while (SortedIndex < Recoveries.size() &&
- Recoveries[SortedIndex].first == State)
- SortedIndex++;
- }
- Table.RecoveryOffset[NumStates] = SortedIndex;
- assert(SortedIndex == Recoveries.size());
-
- return Table;
-}
-
-LRTable LRTable::buildSLR(const Grammar &G) {
- auto Graph = LRGraph::buildLR0(G);
- Builder Build(G);
- Build.StartStates = Graph.startStates();
- for (const auto &T : Graph.edges())
- Build.Transition.try_emplace({T.Src, T.Label}, T.Dst);
- for (const auto &Entry : Graph.recoveries())
- Build.Recoveries.push_back(
- {Entry.Src, Recovery{Entry.Strategy, Entry.Result}});
- Build.FollowSets = followSets(G);
- assert(Graph.states().size() <= (1 << StateBits) &&
- "Graph states execceds the maximum limit!");
- // Add reduce actions.
- for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
- for (const Item &I : Graph.states()[SID].Items) {
- // If we've just parsed the start symbol, this means we successfully parse
- // the input. We don't add the reduce action of `_ := start_symbol` in the
- // LRTable (the GLR parser handles it specifically).
- if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext())
- continue;
- if (!I.hasNext())
- // If we've reached the end of a rule A := ..., then we can reduce if
- // the next token is in the follow set of A.
- Build.Reduce[SID].insert(I.rule());
- }
- }
- return std::move(Build).build();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/test/CMakeLists.txt b/clang-tools-extra/pseudo/test/CMakeLists.txt
index 712527f78140e..56694c4a9f5a6 100644
--- a/clang-tools-extra/pseudo/test/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/test/CMakeLists.txt
@@ -1,6 +1,4 @@
set(CLANG_PSEUDO_TEST_DEPS
- clang-pseudo
- clang-pseudo-fuzzer
ClangPseudoTests
)
diff --git a/clang-tools-extra/pseudo/test/check-cxx-bnf.test b/clang-tools-extra/pseudo/test/check-cxx-bnf.test
deleted file mode 100644
index b825ff32faa1c..0000000000000
--- a/clang-tools-extra/pseudo/test/check-cxx-bnf.test
+++ /dev/null
@@ -1,2 +0,0 @@
-// verify clang/lib/Tooling/Syntax/Pseudo/cxx/cxx.bnf
-// RUN: clang-pseudo -grammar=%cxx-bnf-file
diff --git a/clang-tools-extra/pseudo/test/crash/backslashes.c b/clang-tools-extra/pseudo/test/crash/backslashes.c
deleted file mode 100644
index 4ca70c609a0e6..0000000000000
--- a/clang-tools-extra/pseudo/test/crash/backslashes.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// We used to try to interpret these backslashes as UCNs.
-// RUN: clang-pseudo -source=%s -print-tokens
-\
-\ x
diff --git a/clang-tools-extra/pseudo/test/cxx/capture-list.cpp b/clang-tools-extra/pseudo/test/cxx/capture-list.cpp
deleted file mode 100644
index fde46e4f0e038..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/capture-list.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-// We loosely allow capture defaults in any position/multiple times.
-auto lambda = [&, &foo, bar(x), =]{};
-// CHECK: lambda-introducer := [ capture-list ]
-// CHECK-NEXT: ├─[
-// CHECK-NEXT: ├─capture-list
-// CHECK-NEXT: │ ├─capture-list
-// CHECK-NEXT: │ │ ├─capture-list
-// CHECK-NEXT: │ │ │ ├─capture-list~& := tok[4]
-// CHECK-NEXT: │ │ │ ├─,
-// CHECK-NEXT: │ │ │ └─capture~simple-capture
-// CHECK-NEXT: │ │ │ ├─&
-// CHECK-NEXT: │ │ │ └─IDENTIFIER := tok[7]
-// CHECK-NEXT: │ │ ├─,
-// CHECK-NEXT: │ │ └─capture~init-capture
-// CHECK-NEXT: │ │ ├─IDENTIFIER := tok[9]
-// CHECK-NEXT: │ │ └─initializer := ( expression-list )
-// CHECK-NEXT: │ │ ├─(
-// CHECK-NEXT: │ │ ├─expression-list~IDENTIFIER := tok[11]
-// CHECK-NEXT: │ │ └─)
-// CHECK-NEXT: │ ├─,
-// CHECK-NEXT: │ └─capture~=
-// CHECK-NEXT: └─]
diff --git a/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp b/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
deleted file mode 100644
index ae74353c0a156..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/contextual-keywords.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-// Verify that the contextual-{final,override} rules are guarded conditionally,
-// No ambiguous parsing for the virt-specifier.
-class Foo {
- void foo1() override;
-// CHECK: virt-specifier-seq~IDENTIFIER := tok[7]
- void foo2() final;
-// CHECK: virt-specifier-seq~IDENTIFIER := tok[13]
-};
diff --git a/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp b/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp
deleted file mode 100644
index 151f3931b53f9..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/dangling-else.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s
-
-// Verify the else should belong to the nested if statement
-if (true) if (true) {} else {}
-
-// CHECK: statement-seq~selection-statement := IF ( condition ) statement
-// CHECK-NEXT: ├─IF
-// CHECK-NEXT: ├─(
-// CHECK-NEXT: ├─condition~TRUE
-// CHECK-NEXT: ├─)
-// CHECK-NEXT: └─statement~selection-statement
-// CHECK-NEXT: ├─IF
-// CHECK-NEXT: ├─(
-// CHECK-NEXT: ├─condition~TRUE
-// CHECK-NEXT: ├─)
-// CHECK-NEXT: ├─statement~compound-statement := { }
-// CHECK-NEXT: │ ├─{
-// CHECK-NEXT: │ └─}
-// CHECK-NEXT: ├─ELSE
-// CHECK-NEXT: └─statement~compound-statement := { }
-// CHECK-NEXT: ├─{
-// CHECK-NEXT: └─}
diff --git a/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp b/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp
deleted file mode 100644
index 255e8bedac497..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/decl-specfier-seq.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-
-// not parsed as Type{foo} Type{bar}
-foo bar;
-// CHECK-NOT: simple-declaration := decl-specifier-seq ;
-// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK: ├─decl-specifier-seq~simple-type-specifier
-// CHECK: ├─init-declarator-list~IDENTIFIER
-// CHECK: └─;
-// CHECK-NOT: simple-declaration := decl-specifier-seq ;
-
-// not parsed as Type{std} Type{::string} Declarator{s};
-std::string s;
-// CHECK-NOT: nested-name-specifier := ::
-// CHECK: simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK: ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
-// CHECK: │ ├─simple-type-specifier := nested-name-specifier type-name
-// CHECK: │ │ ├─nested-name-specifier := <ambiguous> #1
-// CHECK: │ │ │ ├─nested-name-specifier := type-name ::
-// CHECK: │ │ │ └─nested-name-specifier := namespace-name ::
-// CHECK: │ │ └─type-name
-// CHECK: │ └─simple-type-specifier := nested-name-specifier template-name
-// CHECK: │ ├─nested-name-specifier =#1
-// CHECK: │ └─template-name~IDENTIFIER
-// CHECK: ├─init-declarator-list~IDENTIFIER
-// CHECK: └─;
-// CHECK-NOT: nested-name-specifier := ::
diff --git a/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp b/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp
deleted file mode 100644
index 4d7972807c6db..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/declarator-function.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// The standard grammar allows an init-list with any declarator, including
-// a function declarator. This creates an ambiguity where a function-definition
-// is misparsed as a simple-declaration.
-
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void s(){};
-// CHECK-NOT: simple-declaration
-// CHECK: function-definition := decl-specifier-seq function-declarator function-body
-// CHECK-NOT: simple-declaration
diff --git a/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp b/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp
deleted file mode 100644
index 5aedd8037513f..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/declarator-var.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// The standard grammar allows an function-body to use any declarator, including
-// a non-function declarator. This creates an ambiguity where a
-// simple-declaration is misparsed as a function-definition.
-
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void (*s)(){};
-// CHECK-NOT: function-definition
-// CHECK: init-declarator := non-function-declarator initializer
-// CHECK-NOT: function-definition
diff --git a/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp b/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp
deleted file mode 100644
index 58d0ff4ccae9a..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/declator-member-function.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-
-// Similiar to declarator-function.cpp, but for member functions.
-class Foo {
- void foo() {};
-// CHECK-NOT: member-declarator := declarator brace-or-equal-initializer
-// CHECK: member-declaration~function-definition := decl-specifier-seq function-declarator function-body
-// CHECK-NOT: member-declarator := declarator brace-or-equal-initializer
-};
diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp
deleted file mode 100644
index 2540dd010fcef..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/empty-member-declaration.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest --forest-abbrev=false | FileCheck %s
-class A {
- ;
-// CHECK-NOT: member-declaration := ;
-// CHECK: member-declaration := empty-declaration
-// CHECK-NOT: member-declaration := ;
-};
diff --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
deleted file mode 100644
index 4d15835565b7e..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-class Foo {
-public:
-};
-// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=Brackets] }
-// CHECK-NEXT: ├─class-head := class-key class-head-name
-// CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
-// CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]
-// CHECK-NEXT: ├─{ := tok[2]
-// CHECK-NEXT: ├─member-specification := access-specifier :
-// CHECK-NEXT: │ ├─access-specifier~PUBLIC := tok[3]
-// CHECK-NEXT: │ └─: := tok[4]
-// CHECK-NEXT: └─} := tok[5]
diff --git a/clang-tools-extra/pseudo/test/cxx/keyword.cpp b/clang-tools-extra/pseudo/test/cxx/keyword.cpp
deleted file mode 100644
index 318db4ccc49b9..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/keyword.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-bool operator<();
-// CHECK: translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~BOOL
-// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator parameters-and-qualifiers
-// CHECK-NEXT: │ ├─noptr-declarator~operator-function-id := OPERATOR operator-name
-// CHECK-NEXT: │ │ ├─OPERATOR
-// CHECK-NEXT: │ │ └─operator-name~<
-// CHECK-NEXT: │ └─parameters-and-qualifiers := ( )
-// CHECK-NEXT: │ ├─(
-// CHECK-NEXT: │ └─)
-// CHECK-NEXT: └─;
diff --git a/clang-tools-extra/pseudo/test/cxx/literals.cpp b/clang-tools-extra/pseudo/test/cxx/literals.cpp
deleted file mode 100644
index e1cec8985b25f..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/literals.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest -forest-abbrev=0 | FileCheck %s --implicit-check-not=ambiguous
-auto list = {
- 0, // CHECK: := integer-literal
- 0b1011, // CHECK: := integer-literal
- 0777, // CHECK: := integer-literal
- 42_u, // CHECK: := user-defined-integer-literal
- 0LL, // CHECK: := integer-literal
- 0h, // CHECK: := user-defined-integer-literal
- 0., // CHECK: := floating-point-literal
- .2, // CHECK: := floating-point-literal
- 2e1, // CHECK: := floating-point-literal
- 0x42d, // CHECK: := integer-literal
- 0x42_d, // CHECK: := user-defined-integer-literal
- 0x42ds, // CHECK: := user-defined-integer-literal
- 0x1.2p2,// CHECK: := floating-point-literal
-
- "", // CHECK: literal := string-literal
- L"", // CHECK: literal := string-literal
- u8"", // CHECK: literal := string-literal
- u"", // CHECK: literal := string-literal
- U"", // CHECK: literal := string-literal
- R"()", // CHECK: literal := string-literal
- uR"()", // CHECK: literal := string-literal
- "a" "b", // CHECK: literal := string-literal
- u8"a" "b", // CHECK: literal := string-literal
- u"a" u"b", // CHECK: literal := string-literal
- "a"_u "b", // CHECK: user-defined-literal := user-defined-string-literal
- "a"_u u"b", // CHECK: user-defined-literal := user-defined-string-literal
- R"(a)" "\n", // CHECK: literal := string-literal
- R"c(a)c"_u u"\n", // CHECK: user-defined-literal := user-defined-string-literal
-
- 'a', // CHECK: := character-literal
- 'abc', // CHECK: := character-literal
- 'abcdef', // CHECK: := character-literal
- u'a', // CHECK: := character-literal
- U'a', // CHECK: := character-literal
- L'a', // CHECK: := character-literal
- L'abc', // CHECK: := character-literal
- U'\u1234',// CHECK: := character-literal
- '\u1234', // CHECK: := character-literal
- u'a'_u, // CHECK: := user-defined-character-literal
-};
-
diff --git a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp b/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
deleted file mode 100644
index d605a3d66a5de..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-// FIXME: tighten CHECK to CHECK-NEXT once numeric literals are unambiguous.
-auto x = { 1, .f = 2, [c]{3} };
-// CHECK: initializer-clause~braced-init-list
-// CHECK-NEXT: ├─{ := tok[3]
-// CHECK-NEXT: ├─initializer-list
-// CHECK-NEXT: │ ├─initializer-list
-// CHECK-NEXT: │ │ ├─initializer-list~NUMERIC_CONSTANT
-// CHECK-NEXT: │ │ ├─, := tok[5]
-// CHECK-NEXT: │ │ └─initializer-list-item
-// CHECK-NEXT: │ │ ├─designator
-// CHECK-NEXT: │ │ │ ├─. := tok[6]
-// CHECK-NEXT: │ │ │ └─IDENTIFIER := tok[7]
-// CHECK-NEXT: │ │ └─brace-or-equal-initializer
-// CHECK-NEXT: │ │ ├─= := tok[8]
-// CHECK-NEXT: │ │ └─initializer-clause~NUMERIC_CONSTANT
-// CHECK-NEXT: │ ├─, := tok[10]
-// CHECK-NEXT: │ └─initializer-list-item
-// CHECK-NEXT: │ ├─designator
-// CHECK-NEXT: │ │ ├─[ := tok[11]
-// CHECK-NEXT: │ │ ├─expression~IDENTIFIER := tok[12]
-// CHECK-NEXT: │ │ └─] := tok[13]
-// CHECK-NEXT: │ └─brace-or-equal-initializer~braced-init-list
-// CHECK-NEXT: │ ├─{ := tok[14]
-// CHECK-NEXT: │ ├─initializer-list~NUMERIC_CONSTANT
-// CHECK: │ └─} := tok[16]
-// CHECK-NEXT: └─} := tok[17]
diff --git a/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp b/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp
deleted file mode 100644
index 41d0fa13ff6dd..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/nested-name-specifier.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-
-// Verify that we don't form a complete `::` nested-name-specifier if there is
-// an identifier preceding it.
-Foo::Foo() {} // No "Foo ::Foo()" false parse
-// CHECK: ├─declaration-seq~function-definition := function-declarator function-body
-// CHECK-NEXT: │ ├─function-declarator~noptr-declarator := noptr-declarator parameters-and-qualifiers
-
-int ::x;
-// CHECK: declaration~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~INT
-
-void test() {
- X::Y::Z; // No false qualified-declarator parses "X ::Y::Z" and "X::Y ::Z".
-// CHECK: statement-seq~statement := <ambiguous>
-// CHECK: statement~expression-statement := expression ;
-// CHECK: statement~simple-declaration := decl-specifier-seq ;
-// CHECK-NOT: simple-declaration := decl-specifier-seq init-declarator-list ;
-
- // FIXME: eliminate the false `a<b> ::c` declaration parse.
- a<b>::c;
-// CHECK: statement := <ambiguous>
-// CHECK-NEXT: ├─statement~expression-statement := expression ;
-// CHECK-NEXT: │ ├─expression~relational-expression :=
-// CHECK: └─statement~simple-declaration := <ambiguous>
-// CHECK-NEXT: ├─simple-declaration := decl-specifier-seq ;
-// CHECK: └─simple-declaration := decl-specifier-seq init-declarator-list ;
-}
diff --git a/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp b/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp
deleted file mode 100644
index 1426f4e0a9bc0..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/parameter-decl-clause.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void foo2(int, ...);
-// CHECK: translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~VOID :=
-// CHECK-NEXT: ├─init-declarator-list~noptr-declarator := noptr-declarator parameters-and-qualifiers
-// CHECK-NEXT: │ ├─noptr-declarator~IDENTIFIER :=
-// CHECK-NEXT: │ └─parameters-and-qualifiers := ( parameter-declaration-clause [recover=Brackets] )
-// CHECK-NEXT: │ ├─( :=
-// CHECK-NEXT: │ ├─parameter-declaration-clause := parameter-declaration-list , ...
-// CHECK-NEXT: │ │ ├─parameter-declaration-list~INT :=
-// CHECK-NEXT: │ │ ├─, :=
-// CHECK-NEXT: │ │ └─... :=
-// CHECK-NEXT: │ └─) :=
-// CHECK-NEXT: └─; :=
diff --git a/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp b/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp
deleted file mode 100644
index 5d48a3a43d027..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/predefined-identifier.cpp
+++ /dev/null
@@ -1,5 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void s() {
- __func__;
- // CHECK: expression~__FUNC__ := tok[5]
-}
diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp
deleted file mode 100644
index 0b41f881fa3bf..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/recovery-func-parameters.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void foo(complete garbage???) {}
-// CHECK: translation-unit~function-definition := decl-specifier-seq function-declarator function-body
-// CHECK-NEXT: ├─decl-specifier-seq~VOID := tok[0]
-// CHECK-NEXT: ├─function-declarator~noptr-declarator := noptr-declarator parameters-and-qualifiers
-// CHECK-NEXT: │ ├─noptr-declarator~IDENTIFIER := tok[1]
-// CHECK-NEXT: │ └─parameters-and-qualifiers := ( parameter-declaration-clause [recover=Brackets] )
-// CHECK-NEXT: │ ├─( := tok[2]
-// CHECK-NEXT: │ ├─parameter-declaration-clause := <opaque>
-// CHECK-NEXT: │ └─) := tok[8]
-// CHECK-NEXT: └─function-body~compound-statement := { }
-// CHECK-NEXT: ├─{ := tok[9]
-// CHECK-NEXT: └─} := tok[10]
diff --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
deleted file mode 100644
index 38216ad964772..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-auto x = { complete garbage };
-// CHECK: translation-unit~simple-declaration
-// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
-// CHECK-NEXT: ├─init-declarator-list~init-declarator
-// CHECK-NEXT: │ ├─non-function-declarator~IDENTIFIER := tok[1]
-// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
-// CHECK-NEXT: │ ├─= := tok[2]
-// CHECK-NEXT: │ └─initializer-clause~braced-init-list
-// CHECK-NEXT: │ ├─{ := tok[3]
-// CHECK-NEXT: │ ├─initializer-list := <opaque>
-// CHECK-NEXT: │ └─} := tok[6]
-// CHECK-NEXT: └─; := tok[7]
diff --git a/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp b/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp
deleted file mode 100644
index 1c68e928ddd62..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/structured-binding.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s
-
-// Verify there is no false parse of the structured binding declaration.
-ABC[post] = abc;
-// CHECK: statement-seq~expression-statement := expression ;
-// CHECK: postfix-expression [ expr-or-braced-init-list ]
diff --git a/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp b/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp
deleted file mode 100644
index 02aff285f838c..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/template-empty-type-parameter.cpp
+++ /dev/null
@@ -1,3 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-template <typename> struct MatchParents;
-// CHECK: template-parameter-list~TYPENAME := tok[2]
diff --git a/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp b/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp
deleted file mode 100644
index 1f7b106e0e93b..0000000000000
--- a/clang-tools-extra/pseudo/test/cxx/unsized-array.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest | FileCheck %s
-void s(int[]);
-// CHECK: parameter-declaration-clause~parameter-declaration := decl-specifier-seq abstract-declarator
-// CHECK-NEXT: ├─decl-specifier-seq~INT := tok[3]
-// CHECK-NEXT: └─abstract-declarator~noptr-abstract-declarator := [ ]
-// CHECK-NEXT: ├─[ := tok[4]
-// CHECK-NEXT: └─] := tok[5]
diff --git a/clang-tools-extra/pseudo/test/fuzzer.cpp b/clang-tools-extra/pseudo/test/fuzzer.cpp
deleted file mode 100644
index 400746a9d12d5..0000000000000
--- a/clang-tools-extra/pseudo/test/fuzzer.cpp
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: clang-pseudo-fuzzer -grammar=%cxx-bnf-file -print %s | FileCheck %s
-int x;
-// CHECK: translation-unit := declaration-seq
-// CHECK: builtin-type := INT
diff --git a/clang-tools-extra/pseudo/test/glr-variant-start.cpp b/clang-tools-extra/pseudo/test/glr-variant-start.cpp
deleted file mode 100644
index 1bd073707353b..0000000000000
--- a/clang-tools-extra/pseudo/test/glr-variant-start.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --start-symbol=statement-seq --print-forest | FileCheck %s
-
-a + a;
-// CHECK: statement-seq~expression-statement := expression ;
-// CHECK-NEXT: ├─expression~additive-expression := additive-expression + multiplicative-expression
-// CHECK-NEXT: │ ├─additive-expression~IDENTIFIER :=
-// CHECK-NEXT: │ ├─+ :=
-// CHECK-NEXT: │ └─multiplicative-expression~IDENTIFIER :=
-// CHECK-NEXT: └─; :=
diff --git a/clang-tools-extra/pseudo/test/glr.cpp b/clang-tools-extra/pseudo/test/glr.cpp
deleted file mode 100644
index f805e42ffa6dd..0000000000000
--- a/clang-tools-extra/pseudo/test/glr.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest -print-statistics | FileCheck %s
-
-void foo() {
- T* a; // a multiply expression or a pointer declaration?
-// CHECK: statement-seq~statement := <ambiguous>
-// CHECK-NEXT: ├─statement~expression-statement := expression ;
-// CHECK-NEXT: │ ├─expression~multiplicative-expression := multiplicative-expression * pm-expression
-// CHECK-NEXT: │ │ ├─multiplicative-expression~IDENTIFIER := tok[5]
-// CHECK-NEXT: │ │ ├─* := tok[6]
-// CHECK-NEXT: │ │ └─pm-expression~id-expression := unqualified-id #1
-// CHECK-NEXT: │ │ └─unqualified-id~IDENTIFIER := tok[7]
-// CHECK-NEXT: │ └─; := tok[8]
-// CHECK-NEXT: └─statement~simple-declaration := decl-specifier-seq init-declarator-list ;
-// CHECK-NEXT: ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
-// CHECK-NEXT: │ ├─simple-type-specifier~IDENTIFIER := tok[5]
-// CHECK-NEXT: │ └─simple-type-specifier~IDENTIFIER := tok[5]
-// CHECK-NEXT: ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator
-// CHECK-NEXT: │ ├─ptr-operator~* := tok[6]
-// CHECK-NEXT: │ └─ptr-declarator~id-expression =#1
-// CHECK-NEXT: └─; := tok[8]
-}
-
-// CHECK: 2 Ambiguous nodes:
-// CHECK-NEXT: 1 simple-type-specifier
-// CHECK-NEXT: 1 statement
-// CHECK-EMPTY:
-// CHECK-NEXT: 0 Opaque nodes:
-// CHECK-EMPTY:
-// CHECK-NEXT: Ambiguity: 0.20 misparses/token
-// CHECK-NEXT: Unparsed: 0.00%
diff --git a/clang-tools-extra/pseudo/test/html-forest.c b/clang-tools-extra/pseudo/test/html-forest.c
deleted file mode 100644
index 0be08da49f4a7..0000000000000
--- a/clang-tools-extra/pseudo/test/html-forest.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// RUN: clang-pseudo -source %s -html-forest=%t.html
-// RUN: FileCheck %s < %t.html
-int main() {
-}
-// Sanity check for some obvious strings.
-// CHECK-DAG: <body>
-// CHECK-DAG: "compound-statement"
-// CHECK-DAG: main
diff --git a/clang-tools-extra/pseudo/test/lex.c b/clang-tools-extra/pseudo/test/lex.c
deleted file mode 100644
index ebebd2e0fb72f..0000000000000
--- a/clang-tools-extra/pseudo/test/lex.c
+++ /dev/null
@@ -1,42 +0,0 @@
-int is_debug() {
-#ifndef NDEBUG
- return 1; // in debug mode
-#else
- return 0;
-#endif
-}
-
-/* This comment gets lexed along with the input above! We just don't CHECK it.
-
-RUN: clang-pseudo -source %s -print-source | FileCheck %s -check-prefix=SOURCE --strict-whitespace
- SOURCE: int is_debug() {
-SOURCE-NEXT: #ifndef NDEBUG
-SOURCE-NEXT: return 1; // in debug mode
-SOURCE-NEXT: #else
-SOURCE-NEXT: return 0;
-SOURCE-NEXT: #end
-SOURCE-NEXT: }
-
-RUN: clang-pseudo -source %s -print-tokens | FileCheck %s -check-prefix=TOKEN
- TOKEN: 0: raw_identifier 0:0 "int" flags=1
-TOKEN-NEXT: raw_identifier 0:0 "is_debug"
-TOKEN-NEXT: l_paren 0:0 "("
-TOKEN-NEXT: r_paren 0:0 ")"
-TOKEN-NEXT: l_brace 0:0 "{"
-TOKEN-NEXT: hash 1:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 1:0 "ifndef"
-TOKEN-NEXT: raw_identifier 1:0 "NDEBUG"
-TOKEN-NEXT: raw_identifier 2:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 2:2 "1"
-TOKEN-NEXT: semi 2:2 ";"
-TOKEN-NEXT: comment 2:2 "// in debug mode"
-TOKEN-NEXT: hash 3:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 3:0 "else"
-TOKEN-NEXT: raw_identifier 4:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 4:2 "0"
-TOKEN-NEXT: semi 4:2 ";"
-TOKEN-NEXT: hash 5:0 "#" flags=1
-TOKEN-NEXT: raw_identifier 5:0 "endif"
-TOKEN-NEXT: r_brace 6:0 "}" flags=1
-
-*******************************************************************************/
diff --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test
deleted file mode 100644
index 13036349eb8c1..0000000000000
--- a/clang-tools-extra/pseudo/test/lr-build-basic.test
+++ /dev/null
@@ -1,32 +0,0 @@
-_ := expr EOF
-expr := id
-id := IDENTIFIER
-
-# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
-# GRAPH: States:
-# GRAPH-NEXT: State 0
-# GRAPH-NEXT: _ := • expr EOF
-# GRAPH-NEXT: expr := • id
-# GRAPH-NEXT: id := • IDENTIFIER
-# GRAPH-NEXT: State 1
-# GRAPH-NEXT: _ := expr • EOF
-# GRAPH-NEXT: State 2
-# GRAPH-NEXT: expr := id •
-# GRAPH-NEXT: State 3
-# GRAPH-NEXT: id := IDENTIFIER •
-# GRAPH-NEXT: State 4
-# GRAPH-NEXT: _ := expr EOF •
-
-# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
-# TABLE: LRTable:
-# TABLE-NEXT: State 0
-# TABLE-NEXT: IDENTIFIER: shift state 3
-# TABLE-NEXT: expr: go to state 1
-# TABLE-NEXT: id: go to state 2
-# TABLE-NEXT: State 1
-# TABLE-NEXT: EOF: shift state 4
-# TABLE-NEXT: State 2
-# TABLE-NEXT: EOF: reduce by rule 2 'expr := id'
-# TABLE-NEXT: State 3
-# TABLE-NEXT: EOF: reduce by rule 1 'id := IDENTIFIER'
-# TABLE-NEXT: State 4
diff --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test
deleted file mode 100644
index a66ce4d622ca1..0000000000000
--- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ /dev/null
@@ -1,49 +0,0 @@
-_ := expr EOF
-expr := expr - expr # S/R conflict at state 4 on '-' token
-expr := IDENTIFIER
-
-# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
-# GRAPH: States
-# GRAPH-NEXT: State 0
-# GRAPH-NEXT: _ := • expr EOF
-# GRAPH-NEXT: expr := • expr - expr
-# GRAPH-NEXT: expr := • IDENTIFIER
-# GRAPH-NEXT: State 1
-# GRAPH-NEXT: _ := expr • EOF
-# GRAPH-NEXT: expr := expr • - expr
-# GRAPH-NEXT: State 2
-# GRAPH-NEXT: expr := IDENTIFIER •
-# GRAPH-NEXT: State 3
-# GRAPH-NEXT: _ := expr EOF •
-# GRAPH-NEXT: State 4
-# GRAPH-NEXT: expr := • expr - expr
-# GRAPH-NEXT: expr := expr - • expr
-# GRAPH-NEXT: expr := • IDENTIFIER
-# GRAPH-NEXT: State 5
-# GRAPH-NEXT: expr := expr - expr •
-# GRAPH-NEXT: expr := expr • - expr
-# GRAPH-NEXT: 0 ->[expr] 1
-# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
-# GRAPH-NEXT: 1 ->[EOF] 3
-# GRAPH-NEXT: 1 ->[-] 4
-# GRAPH-NEXT: 4 ->[expr] 5
-# GRAPH-NEXT: 4 ->[IDENTIFIER] 2
-# GRAPH-NEXT: 5 ->[-] 4
-
-# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
-# TABLE: LRTable:
-# TABLE-NEXT: State 0
-# TABLE-NEXT: IDENTIFIER: shift state 2
-# TABLE-NEXT: expr: go to state 1
-# TABLE-NEXT: State 1
-# TABLE-NEXT: EOF: shift state 3
-# TABLE-NEXT: -: shift state 4
-# TABLE-NEXT: State 2
-# TABLE-NEXT: EOF -: reduce by rule 2 'expr := IDENTIFIER'
-# TABLE-NEXT: State 3
-# TABLE-NEXT: State 4
-# TABLE-NEXT: IDENTIFIER: shift state 2
-# TABLE-NEXT: expr: go to state 5
-# TABLE-NEXT: State 5
-# TABLE-NEXT: -: shift state 4
-# TABLE-NEXT: EOF -: reduce by rule 1 'expr := expr - expr'
diff --git a/clang-tools-extra/pseudo/test/strip-directives.c b/clang-tools-extra/pseudo/test/strip-directives.c
deleted file mode 100644
index c7878d9295a08..0000000000000
--- a/clang-tools-extra/pseudo/test/strip-directives.c
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <stdio.h>
-int main() {
-#error This was inevitable...
-#if HELLO
- printf("hello, world\n");
- return 0;
-#else
- abort();
-#endif
-}
-
-/* This comment gets lexed along with the input above! We just don't CHECK it.
-
-RUN: clang-pseudo -source %s -print-directive-tree | FileCheck %s -check-prefix=PPT --strict-whitespace
- PPT: #include (7 tokens)
-PPT-NEXT: code (5 tokens)
-PPT-NEXT: #error (6 tokens)
-PPT-NEXT: #if (3 tokens) TAKEN
-PPT-NEXT: code (8 tokens)
-PPT-NEXT: #else (2 tokens)
-PPT-NEXT: code (4 tokens)
-PPT-NEXT: #endif (2 tokens)
-PPT-NEXT: code (2 tokens)
- ^ including this block comment
-
-RUN: clang-pseudo -source %s -strip-directives -print-source | FileCheck %s --strict-whitespace
- CHECK: int main() {
-CHECK-NEXT: printf("hello, world\n");
-CHECK-NEXT: return 0;
-CHECK-NEXT: }
-
-RUN: clang-pseudo -source %s -strip-directives -print-tokens | FileCheck %s --check-prefix=TOKEN
- TOKEN: 0: raw_identifier 1:0 "int" flags=1
-TOKEN-NEXT: raw_identifier 1:0 "main"
-TOKEN-NEXT: l_paren 1:0 "("
-TOKEN-NEXT: r_paren 1:0 ")"
-TOKEN-NEXT: l_brace 1:0 "{"
-TOKEN-NEXT: raw_identifier 4:2 "printf" flags=1
-TOKEN-NEXT: l_paren 4:2 "("
-TOKEN-NEXT: string_literal 4:2 "\22hello, world\\n\22"
-TOKEN-NEXT: r_paren 4:2 ")"
-TOKEN-NEXT: semi 4:2 ";"
-TOKEN-NEXT: raw_identifier 5:2 "return" flags=1
-TOKEN-NEXT: numeric_constant 5:2 "0"
-TOKEN-NEXT: semi 5:2 ";"
-TOKEN-NEXT: r_brace 9:0 "}" flags=1
-
-*******************************************************************************/
-
diff --git a/clang-tools-extra/pseudo/tool/CMakeLists.txt b/clang-tools-extra/pseudo/tool/CMakeLists.txt
deleted file mode 100644
index 49e1dc29a5a4e..0000000000000
--- a/clang-tools-extra/pseudo/tool/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-set(LLVM_LINK_COMPONENTS support)
-
-add_clang_tool(clang-pseudo
- ClangPseudo.cpp
- HTMLForest.cpp
- )
-
-clang_target_link_libraries(clang-pseudo
- PRIVATE
- clangBasic
- )
-
-target_link_libraries(clang-pseudo
- PRIVATE
- clangPseudo
- clangPseudoGrammar
- clangPseudoCLI
- )
-
-add_custom_command(OUTPUT HTMLForestResources.inc
- COMMAND "${Python3_EXECUTABLE}" ${CLANG_SOURCE_DIR}/utils/bundle_resources.py
- ${CMAKE_CURRENT_BINARY_DIR}/HTMLForestResources.inc
- HTMLForest.css HTMLForest.js HTMLForest.html
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMENT "Bundling HTMLForest resources"
- DEPENDS ${CLANG_SOURCE_DIR}/utils/bundle_resources.py HTMLForest.css HTMLForest.js HTMLForest.html
- VERBATIM)
-add_custom_target(clang-pseudo-resources DEPENDS HTMLForestResources.inc)
-add_dependencies(clang-pseudo clang-pseudo-resources)
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
deleted file mode 100644
index 6a64760749cef..0000000000000
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-//===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/DirectiveTree.h"
-#include "clang-pseudo/Disambiguate.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/cli/CLI.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang-pseudo/grammar/LRGraph.h"
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/STLFunctionalExtras.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Signals.h"
-#include <optional>
-
-using clang::pseudo::ForestNode;
-using clang::pseudo::Token;
-using clang::pseudo::TokenStream;
-using llvm::cl::desc;
-using llvm::cl::init;
-using llvm::cl::opt;
-
-static opt<bool> PrintGrammar("print-grammar", desc("Print the grammar"));
-static opt<bool> PrintGraph("print-graph",
- desc("Print the LR graph for the grammar"));
-static opt<bool> PrintTable("print-table",
- desc("Print the LR table for the grammar"));
-static opt<std::string> Source("source", desc("Source file"));
-static opt<bool> PrintSource("print-source", desc("Print token stream"));
-static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
-static opt<bool>
- PrintDirectiveTree("print-directive-tree",
- desc("Print directive structure of source code"));
-static opt<bool>
- StripDirectives("strip-directives",
- desc("Strip directives and select conditional sections"));
-static opt<bool> Disambiguate("disambiguate",
- desc("Choose best tree from parse forest"));
-static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
-static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
-static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
- init(true));
-static opt<std::string> HTMLForest("html-forest",
- desc("output file for HTML forest"));
-static opt<std::string> StartSymbol("start-symbol",
- desc("Specify the start symbol to parse"),
- init("translation-unit"));
-
-static std::string readOrDie(llvm::StringRef Path) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
- llvm::MemoryBuffer::getFile(Path);
- if (std::error_code EC = Text.getError()) {
- llvm::errs() << "Error: can't read file '" << Path
- << "': " << EC.message() << "\n";
- ::exit(1);
- }
- return Text.get()->getBuffer().str();
-}
-
-namespace clang {
-namespace pseudo {
-// Defined in HTMLForest.cpp
-void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &,
- const ForestNode &Root, const Disambiguation &,
- const TokenStream &);
-namespace {
-
-struct NodeStats {
- unsigned Total = 0;
- std::vector<std::pair<SymbolID, unsigned>> BySymbol;
-
- NodeStats(const ForestNode &Root,
- llvm::function_ref<bool(const ForestNode &)> Filter) {
- llvm::DenseMap<SymbolID, unsigned> Map;
- for (const ForestNode &N : Root.descendants())
- if (Filter(N)) {
- ++Total;
- ++Map[N.symbol()];
- }
- BySymbol = {Map.begin(), Map.end()};
- // Sort by count descending, then symbol ascending.
- llvm::sort(BySymbol, [](const auto &L, const auto &R) {
- return std::tie(R.second, L.first) < std::tie(L.second, R.first);
- });
- }
-};
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
-
-int main(int argc, char *argv[]) {
- llvm::cl::ParseCommandLineOptions(argc, argv, "");
- llvm::sys::PrintStackTraceOnErrorSignal(argv[0]);
-
- clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
- std::string SourceText;
- std::optional<clang::pseudo::TokenStream> RawStream;
- std::optional<TokenStream> PreprocessedStream;
- std::optional<clang::pseudo::TokenStream> ParseableStream;
- if (Source.getNumOccurrences()) {
- SourceText = readOrDie(Source);
- RawStream = clang::pseudo::lex(SourceText, LangOpts);
- TokenStream *Stream = &*RawStream;
-
- auto DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream);
- clang::pseudo::chooseConditionalBranches(DirectiveStructure, *RawStream);
-
- std::optional<TokenStream> Preprocessed;
- if (StripDirectives) {
- Preprocessed = DirectiveStructure.stripDirectives(*Stream);
- Stream = &*Preprocessed;
- }
-
- if (PrintSource)
- Stream->print(llvm::outs());
- if (PrintTokens)
- llvm::outs() << *Stream;
- if (PrintDirectiveTree)
- llvm::outs() << DirectiveStructure;
-
- ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
- pairBrackets(*ParseableStream);
- }
-
- const auto &Lang = clang::pseudo::getLanguageFromFlags();
- if (PrintGrammar)
- llvm::outs() << Lang.G.dump();
- if (PrintGraph)
- llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests(
- Lang.G);
-
- if (PrintTable)
- llvm::outs() << Lang.Table.dumpForTests(Lang.G);
- if (PrintStatistics)
- llvm::outs() << Lang.Table.dumpStatistics();
-
- if (ParseableStream) {
- clang::pseudo::ForestArena Arena;
- clang::pseudo::GSS GSS;
- std::optional<clang::pseudo::SymbolID> StartSymID =
- Lang.G.findNonterminal(StartSymbol);
- if (!StartSymID) {
- llvm::errs() << llvm::formatv(
- "The start symbol {0} doesn't exit in the grammar!\n", StartSymbol);
- return 2;
- }
- auto &Root =
- glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
- *StartSymID, Lang);
- // If we're disambiguating, we'll print at the end instead.
- if (PrintForest && !Disambiguate)
- llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
- clang::pseudo::Disambiguation Disambig;
- if (Disambiguate)
- Disambig = clang::pseudo::disambiguate(&Root, {});
-
- if (HTMLForest.getNumOccurrences()) {
- std::error_code EC;
- llvm::raw_fd_ostream HTMLOut(HTMLForest, EC);
- if (EC) {
- llvm::errs() << "Couldn't write " << HTMLForest << ": " << EC.message()
- << "\n";
- return 2;
- }
- clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig,
- *ParseableStream);
- }
-
- if (PrintStatistics) {
- llvm::outs() << "Forest bytes: " << Arena.bytes()
- << " nodes: " << Arena.nodeCount() << "\n";
- llvm::outs() << "GSS bytes: " << GSS.bytes()
- << " nodes: " << GSS.nodesCreated() << "\n";
-
- for (auto &P : {std::make_pair("Ambiguous", ForestNode::Ambiguous),
- std::make_pair("Opaque", ForestNode::Opaque)}) {
- clang::pseudo::NodeStats Stats(
- Root, [&](const auto &N) { return N.kind() == P.second; });
- llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n";
- for (const auto &S : Stats.BySymbol)
- llvm::outs() << llvm::formatv(" {0,3} {1}\n", S.second,
- Lang.G.symbolName(S.first));
- }
-
- // Metrics for how imprecise parsing was.
- // These are rough but aim to be:
- // - linear: if we eliminate half the errors the metric should halve
- // - length-independent
- unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique)
- unsigned Misparses = 0; // Sum of alternatives-1
- llvm::DenseSet<const ForestNode *> Visited;
- auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void {
- if (N.kind() == ForestNode::Opaque) {
- UnparsedTokens += End - N.startTokenIndex();
- } else if (N.kind() == ForestNode::Ambiguous) {
- Misparses += N.alternatives().size() - 1;
- for (const auto *C : N.alternatives())
- if (Visited.insert(C).second)
- DFS(*C, End, DFS);
- } else if (N.kind() == ForestNode::Sequence) {
- for (unsigned I = 0, E = N.children().size(); I < E; ++I)
- if (Visited.insert(N.children()[I]).second)
- DFS(*N.children()[I],
- I + 1 == N.children().size()
- ? End
- : N.children()[I + 1]->startTokenIndex(),
- DFS);
- }
- };
- unsigned Len = ParseableStream->tokens().size();
- DFS(Root, Len, DFS);
- llvm::outs() << "\n";
- llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n",
- double(Misparses) / Len);
- llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
- 100.0 * UnparsedTokens / Len);
- }
-
- if (Disambiguate && PrintForest) {
- ForestNode *DisambigRoot = &Root;
- removeAmbiguities(DisambigRoot, Disambig);
- llvm::outs() << "Disambiguated tree:\n";
- llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
- /*Abbreviated=*/ForestAbbrev);
- }
- }
-
- return 0;
-}
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.cpp b/clang-tools-extra/pseudo/tool/HTMLForest.cpp
deleted file mode 100644
index 184430bddd8d6..0000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-//===-- HTMLForest.cpp - browser-based parse forest explorer
-//---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The plain text forest node dump (clang-pseudo -print-forest) is useful but
-// hard to reconcile with the code being examined, especially when it is large.
-//
-// HTMLForest produces a self-contained HTML file containing both the code and
-// the forest representation, linking them interactively with javascript.
-// At any given time, a single parse tree is shown (ambiguities resolved).
-// The user can switch between ambiguous alternatives.
-//
-// +-------+---------------+
-// | | +-----+|
-// | #tree | #code |#info||
-// | | +-----+|
-// | | |
-// +-------+---------------+
-//
-// #tree is a hierarchical view of the nodes (nested <ul>s), like -print-forest.
-// (It is a simple tree, not a DAG, because ambiguities have been resolved).
-// Like -print-forest, trivial sequences are collapsed (expression~IDENTIFIER).
-//
-// #code is the source code, annotated with <span>s marking the node ranges.
-// These spans are usually invisible (exception: ambiguities are marked), but
-// they are used to show and change the selection.
-//
-// #info is a floating box that shows details of the currently selected node:
-// - rule (for sequence nodes). Abbreviated rules are also shown.
-// - alternatives (for ambiguous nodes). The user can choose an alternative.
-// - ancestors. The parent nodes show how this node fits in translation-unit.
-//
-// There are two types of 'active' node:
-// - *highlight* is what the cursor is over, and is colored blue.
-// Near ancestors are shaded faintly (onion-skin) to show local structure.
-// - *selection* is set by clicking.
-// The #info box shows the selection, and selected nodes have a dashed ring.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Disambiguate.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/JSON.h"
-#include "llvm/Support/raw_ostream.h"
-namespace clang {
-namespace pseudo {
-namespace {
-
-// Defines const char HTMLForest_css[] = "...contents of HTMLForest.css..."; etc
-#include "HTMLForestResources.inc"
-
-struct Writer {
- llvm::raw_ostream &Out;
- const Grammar &G;
- const ForestNode &Root;
- const TokenStream &Stream;
- const Disambiguation &Disambig;
-
- void write() {
- Out << "<!doctype html>\n";
- tag("html", [&] {
- tag("head", [&] {
- tag("title", [&] { Out << "HTMLForest"; });
- tag("script", [&] { Out << HTMLForest_js; });
- tag("style", [&] { Out << HTMLForest_css; });
- tag("script", [&] {
- Out << "var forest=";
- writeForestJSON();
- Out << ";";
- });
- tag("pre id='hidden-code' hidden", [&] { writeCode(); });
- });
- tag("body", [&] { Out << HTMLForest_html; });
- });
- }
-
- void writeCode();
- void writeForestJSON();
- void tag(llvm::StringRef Opener, llvm::function_ref<void()> Body) {
- Out << "<" << Opener << ">";
- Body();
- Out << "</" << Opener.split(' ').first << ">\n";
- }
-};
-
-void Writer::writeCode() {
- // This loop (whitespace logic) is cribbed from TokenStream::Print.
- bool FirstToken = true;
- unsigned LastLine = -1;
- StringRef LastText;
- for (const auto &T : Stream.tokens()) {
- StringRef Text = T.text();
- if (FirstToken) {
- FirstToken = false;
- } else if (T.Line == LastLine) {
- if (LastText.data() + LastText.size() != Text.data())
- Out << ' ';
- } else {
- Out << " \n"; // Extra space aids selection.
- Out.indent(T.Indent);
- }
- Out << "<span class='token' id='t" << Stream.index(T) << "'>";
- llvm::printHTMLEscaped(Text, Out);
- Out << "</span>";
- LastLine = T.Line;
- LastText = Text;
- }
- if (!FirstToken)
- Out << '\n';
-}
-
-// Writes a JSON array of forest nodes. Items are e.g.:
-// {kind:'sequence', symbol:'compound-stmt', children:[5,8,33],
-// rule:'compound-stmt := ...'} {kind:'terminal', symbol:'VOID', token:'t52'}
-// {kind:'ambiguous', symbol:'type-specifier', children:[3,100] selected:3}
-// {kind:'opaque', symbol:'statement-seq', firstToken:'t5', lastToken:'t6'}
-void Writer::writeForestJSON() {
- // This is the flat array of nodes: the index into this array is the node ID.
- std::vector<std::pair<const ForestNode *, /*End*/ Token::Index>> Sequence;
- llvm::DenseMap<const ForestNode *, unsigned> Index;
- auto AssignID = [&](const ForestNode *N, Token::Index End) -> unsigned {
- auto R = Index.try_emplace(N, Sequence.size());
- if (R.second)
- Sequence.push_back({N, End});
- return R.first->second;
- };
- AssignID(&Root, Stream.tokens().size());
- auto TokenID = [](Token::Index I) { return ("t" + llvm::Twine(I)).str(); };
-
- llvm::json::OStream Out(this->Out, 2);
- Out.array([&] {
- for (unsigned I = 0; I < Sequence.size(); ++I) {
- const ForestNode *N = Sequence[I].first;
- Token::Index End = Sequence[I].second;
- Out.object([&] {
- Out.attribute("symbol", G.symbolName(N->symbol()));
- switch (N->kind()) {
- case ForestNode::Terminal:
- Out.attribute("kind", "terminal");
- Out.attribute("token", TokenID(N->startTokenIndex()));
- break;
- case ForestNode::Sequence:
- Out.attribute("kind", "sequence");
- Out.attribute("rule", G.dumpRule(N->rule()));
- break;
- case ForestNode::Ambiguous:
- Out.attribute("kind", "ambiguous");
- Out.attribute("selected",
- AssignID(N->children()[Disambig.lookup(N)], End));
- break;
- case ForestNode::Opaque:
- Out.attribute("kind", "opaque");
- Out.attribute("firstToken", TokenID(N->startTokenIndex()));
- // [firstToken, lastToken] is a closed range.
- // If empty, lastToken is omitted.
- if (N->startTokenIndex() != End)
- Out.attribute("lastToken", TokenID(End - 1));
- break;
- }
- auto Children = N->children();
- if (!Children.empty())
- Out.attributeArray("children", [&] {
- for (unsigned I = 0; I < Children.size(); ++I)
- Out.value(AssignID(Children[I],
- I + 1 == Children.size()
- ? End
- : Children[I + 1]->startTokenIndex()));
- });
- });
- }
- });
-}
-
-} // namespace
-
-// We only accept the derived stream here.
-// FIXME: allow the original stream instead?
-void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &G,
- const ForestNode &Root, const Disambiguation &Disambig,
- const TokenStream &Stream) {
- Writer{OS, G, Root, Stream, Disambig}.write();
-}
-
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.css b/clang-tools-extra/pseudo/tool/HTMLForest.css
deleted file mode 100644
index 674cd59f0e76b..0000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.css
+++ /dev/null
@@ -1,93 +0,0 @@
-body {
- position: absolute;
- top: 0;
- bottom: 0;
- right: 0;
- left: 0;
-
- display: flex;
- align-items: stretch;
- margin: 0;
- font-family: sans-serif;
- white-space: nowrap;
- height: 100%;
-}
-body > * {
- overflow-y: auto; /* Scroll sections independently*/
- margin: 0;
-}
-
-#code {
- font-size: 18px;
- line-height: 36px;
- flex-grow: 1;
- padding-right: 10em; /* Leave space for #info */
-}
-#code span {
- padding: 9px 0; /* No "gaps" between lines due to line-height */
-}
-.node.ambiguous::before, .ancestors.ambiguous::after, .tree-node.ambiguous > header::after {
- content: /*the thinking man's emoji*/'\01F914';
-}
-
-#info {
- position: fixed;
- right: 2em;
- top: 1em;
- width: 25em;
- border: 1px solid black;
- min-height: 20em;
- background-color: whiteSmoke;
- overflow-x: clip;
- box-shadow: 3px 3px 5px rgba(0,0,0,0.2);
-}
-#info header {
- background-color: black;
- color: white;
- font-size: larger;
- padding: 0.5em;
-}
-#info.ambiguous header { background-color: #803; }
-#info.sequence header { background-color: darkBlue; }
-#info.terminal header { background-color: darkGreen; }
-#info.opaque header { background-color: orangeRed; }
-#i_kind {
- float: right;
- font-size: small;
-}
-#info section {
- padding: 0.5em;
- border-top: 1px solid lightGray;
- overflow-x: auto;
-}
-#i_ancestors { font-size: small; }
-
-#tree {
- flex-grow: 0;
- min-width: 20em;
- margin-right: 1em;
- border-right: 1px solid darkGray;
- background-color: azure;
- font-size: small;
- overflow-x: auto;
- resize: horizontal;
-}
-#tree ul {
- margin: 0;
- display: inline-block;
- padding-left: 6px;
- border-left: 1px solid rgba(0,0,0,0.2);
- list-style: none;
-}
-#tree > ul { border-left: none; }
-.tree-node.selected > header .name { font-weight: bold; }
-.tree-node.terminal .name { font-family: monospace; }
-.tree-node.ambiguous > header .name { color: #803; font-weight: bold; }
-.tree-node.sequence > header .name { color: darkBlue; }
-.tree-node.terminal > header .name { color: darkGreen; }
-.tree-node.opaque > header .name { color: orangeRed; }
-
-.selected { outline: 1px dashed black; }
-.abbrev { opacity: 50%; }
-.abbrev::after { content: '~'; }
-.opaque { background-color: bisque; }
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.html b/clang-tools-extra/pseudo/tool/HTMLForest.html
deleted file mode 100644
index 4cf98cbbb2cc9..0000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.html
+++ /dev/null
@@ -1,15 +0,0 @@
-<div id="tree"><ul></ul></div>
-<pre id="code"></pre>
-<div id="info" hidden>
- <header>
- <span id="i_symbol"></span>
- <span id="i_kind"></span>
- </header>
- <section>
- <div id="i_rules"></div>
- <div id="i_alternatives"></div>
- </section>
- <section>
- <div id="i_ancestors"></div>
- </section>
-</div>
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.js b/clang-tools-extra/pseudo/tool/HTMLForest.js
deleted file mode 100644
index 24b88a5c10b47..0000000000000
--- a/clang-tools-extra/pseudo/tool/HTMLForest.js
+++ /dev/null
@@ -1,290 +0,0 @@
-// The global map of forest node index => NodeView.
-views = [];
-// NodeView is a visible forest node.
-// It has an entry in the navigation tree, and a span in the code itself.
-// Each NodeView is associated with a forest node, but not all nodes have views:
-// - nodes not reachable though current ambiguity selection
-// - trivial "wrapping" sequence nodes are abbreviated away
-class NodeView {
- // Builds a node representing forest[index], or its target if it is a wrapper.
- // Registers the node in the global map.
- static make(index, parent, abbrev) {
- var node = forest[index];
- if (node.kind == 'sequence' && node.children.length == 1 &&
- forest[node.children[0]].kind != 'ambiguous') {
- abbrev ||= [];
- abbrev.push(index);
- return NodeView.make(node.children[0], parent, abbrev);
- }
- return views[index] = new NodeView(index, parent, node, abbrev);
- }
-
- constructor(index, parent, node, abbrev) {
- this.abbrev = abbrev || [];
- this.parent = parent;
- this.children =
- (node.kind == 'ambiguous' ? [ node.selected ] : node.children || [])
- .map((c) => NodeView.make(c, this));
- this.index = index;
- this.node = node;
- views[index] = this;
-
- this.span = this.buildSpan();
- this.tree = this.buildTree();
- }
-
- // Replaces the token sequence in #code with a <span class=node>.
- buildSpan() {
- var elt = document.createElement('span');
- elt.dataset['index'] = this.index;
- elt.classList.add("node");
- elt.classList.add("selectable-node");
- elt.classList.add(this.node.kind);
-
- var begin = null, end = null;
- if (this.children.length != 0) {
- begin = this.children[0].span;
- end = this.children[this.children.length - 1].span.nextSibling;
- } else if (this.node.kind == 'terminal') {
- begin = document.getElementById(this.node.token);
- end = begin.nextSibling;
- } else if (this.node.kind == 'opaque') {
- begin = document.getElementById(this.node.firstToken);
- end = (this.node.lastToken == null)
- ? begin
- : document.getElementById(this.node.lastToken).nextSibling;
- }
- var parent = begin.parentNode;
- splice(begin, end, elt);
- parent.insertBefore(elt, end);
- return elt;
- }
-
- // Returns a (detached) <li class=tree-node> suitable for use in #tree.
- buildTree() {
- var elt = document.createElement('li');
- elt.dataset['index'] = this.index;
- elt.classList.add('tree-node');
- elt.classList.add('selectable-node');
- elt.classList.add(this.node.kind);
- var header = document.createElement('header');
- elt.appendChild(header);
-
- if (this.abbrev.length > 0) {
- var abbrev = document.createElement('span');
- abbrev.classList.add('abbrev');
- abbrev.innerText = forest[this.abbrev[0]].symbol;
- header.appendChild(abbrev);
- }
- var name = document.createElement('span');
- name.classList.add('name');
- name.innerText = this.node.symbol;
- header.appendChild(name);
-
- if (this.children.length != 0) {
- var sublist = document.createElement('ul');
- this.children.forEach((c) => sublist.appendChild(c.tree));
- elt.appendChild(sublist);
- }
- return elt;
- }
-
- // Make this view visible on the screen by scrolling if needed.
- scrollVisible() {
- scrollIntoViewV(document.getElementById('tree'), this.tree.firstChild);
- scrollIntoViewV(document.getElementById('code'), this.span);
- }
-
- // Fill #info with details of this node.
- renderInfo() {
- document.getElementById('info').classList = this.node.kind;
- document.getElementById('i_symbol').innerText = this.node.symbol;
- document.getElementById('i_kind').innerText = this.node.kind;
-
- // For sequence nodes, add LHS := RHS rule.
- // If this node abbreviates trivial sequences, we want those rules too.
- var rules = document.getElementById('i_rules');
- rules.textContent = '';
- function addRule(i) {
- var ruleText = forest[i].rule;
- if (ruleText == null)
- return;
- var rule = document.createElement('div');
- rule.classList.add('rule');
- rule.innerText = ruleText;
- rules.insertBefore(rule, rules.firstChild);
- }
- this.abbrev.forEach(addRule);
- addRule(this.index);
-
- // For ambiguous nodes, show a selectable list of alternatives.
- var alternatives = document.getElementById('i_alternatives');
- alternatives.textContent = '';
- var that = this;
- function addAlternative(i) {
- var altNode = forest[i];
- var text = altNode.rule || altNode.kind;
- var alt = document.createElement('div');
- alt.classList.add('alternative');
- alt.innerText = text;
- alt.dataset['index'] = i;
- alt.dataset['parent'] = that.index;
- if (i == that.node.selected)
- alt.classList.add('selected');
- alternatives.appendChild(alt);
- }
- if (this.node.kind == 'ambiguous')
- this.node.children.forEach(addAlternative);
-
- // Show the stack of ancestor nodes.
- // The part of each rule that leads to the current node is bolded.
- var ancestors = document.getElementById('i_ancestors');
- ancestors.textContent = '';
- var child = this;
- for (var view = this.parent; view != null;
- child = view, view = view.parent) {
- var indexInParent = view.children.indexOf(child);
-
- var ctx = document.createElement('div');
- ctx.classList.add('ancestors');
- ctx.classList.add('selectable-node');
- ctx.classList.add(view.node.kind);
- if (view.node.rule) {
- // Rule syntax is LHS := RHS1 [annotation] RHS2.
- // We walk through the chunks and bold the one at parentInIndex.
- var chunkCount = 0;
- ctx.innerHTML = view.node.rule.replaceAll(/[^ ]+/g, function(match) {
- if (!(match.startsWith('[') && match.endsWith(']')) /*annotations*/
- && chunkCount++ == indexInParent + 2 /*skip LHS :=*/)
- return '<b>' + match + '</b>';
- return match;
- });
- } else /*ambiguous*/ {
- ctx.innerHTML = '<b>' + view.node.symbol + '</b>';
- }
- ctx.dataset['index'] = view.index;
- if (view.abbrev.length > 0) {
- var abbrev = document.createElement('span');
- abbrev.classList.add('abbrev');
- abbrev.innerText = forest[view.abbrev[0]].symbol;
- ctx.insertBefore(abbrev, ctx.firstChild);
- }
-
- ctx.dataset['index'] = view.index;
- ancestors.appendChild(ctx, ancestors.firstChild);
- }
- }
-
- remove() {
- this.children.forEach((c) => c.remove());
- splice(this.span.firstChild, null, this.span.parentNode,
- this.span.nextSibling);
- detach(this.span);
- delete views[this.index];
- }
-};
-
-var selection = null;
-function selectView(view) {
- var old = selection;
- selection = view;
- if (view == old)
- return;
-
- if (old) {
- old.tree.classList.remove('selected');
- old.span.classList.remove('selected');
- }
- document.getElementById('info').hidden = (view == null);
- if (!view)
- return;
- view.tree.classList.add('selected');
- view.span.classList.add('selected');
- view.renderInfo();
- view.scrollVisible();
-}
-
-// To highlight nodes on hover, we create dynamic CSS rules of the form
-// .selectable-node[data-index="42"] { background-color: blue; }
-// This avoids needing to find all the related nodes and update their classes.
-var highlightSheet = new CSSStyleSheet();
-document.adoptedStyleSheets.push(highlightSheet);
-function highlightView(view) {
- var text = '';
- for (const color of ['#6af', '#bbb', '#ddd', '#eee']) {
- if (view == null)
- break;
- text += '.selectable-node[data-index="' + view.index + '"] '
- text += '{ background-color: ' + color + '; }\n';
- view = view.parent;
- }
- highlightSheet.replace(text);
-}
-
-// Select which branch of an ambiguous node is taken.
-function chooseAlternative(parent, index) {
- var parentView = views[parent];
- parentView.node.selected = index;
- var oldChild = parentView.children[0];
- oldChild.remove();
- var newChild = NodeView.make(index, parentView);
- parentView.children[0] = newChild;
- parentView.tree.lastChild.replaceChild(newChild.tree, oldChild.tree);
-
- highlightView(null);
- // Force redraw of the info box.
- selectView(null);
- selectView(parentView);
-}
-
-// Attach event listeners and build content once the document is ready.
-document.addEventListener("DOMContentLoaded", function() {
- var code = document.getElementById('code');
- var tree = document.getElementById('tree');
- var ancestors = document.getElementById('i_ancestors');
- var alternatives = document.getElementById('i_alternatives');
-
- [code, tree, ancestors].forEach(function(container) {
- container.addEventListener('click', function(e) {
- var nodeElt = e.target.closest('.selectable-node');
- selectView(nodeElt && views[Number(nodeElt.dataset['index'])]);
- });
- container.addEventListener('mousemove', function(e) {
- var nodeElt = e.target.closest('.selectable-node');
- highlightView(nodeElt && views[Number(nodeElt.dataset['index'])]);
- });
- });
-
- alternatives.addEventListener('click', function(e) {
- var altElt = e.target.closest('.alternative');
- if (altElt)
- chooseAlternative(Number(altElt.dataset['parent']),
- Number(altElt.dataset['index']));
- });
-
- // The HTML provides #code content in a hidden DOM element, move it.
- var hiddenCode = document.getElementById('hidden-code');
- splice(hiddenCode.firstChild, hiddenCode.lastChild, code);
- detach(hiddenCode);
-
- // Build the tree of NodeViews and attach to #tree.
- tree.firstChild.appendChild(NodeView.make(0).tree);
-});
-
-// Helper DOM functions //
-
-// Moves the sibling range [first, until) into newParent.
-function splice(first, until, newParent, before) {
- for (var next = first; next != until;) {
- var elt = next;
- next = next.nextSibling;
- newParent.insertBefore(elt, before);
- }
-}
-function detach(node) { node.parentNode.removeChild(node); }
-// Like scrollIntoView, but vertical only!
-function scrollIntoViewV(container, elt) {
- if (container.scrollTop > elt.offsetTop + elt.offsetHeight ||
- container.scrollTop + container.clientHeight < elt.offsetTop)
- container.scrollTo({top : elt.offsetTop, behavior : 'smooth'});
-}
diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
index 821ca4d0652e1..33db4fcd8e2ef 100644
--- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
@@ -5,13 +5,7 @@ set(LLVM_LINK_COMPONENTS
add_custom_target(ClangPseudoUnitTests)
add_unittest(ClangPseudoUnitTests ClangPseudoTests
BracketTest.cpp
- CXXTest.cpp
DirectiveTreeTest.cpp
- DisambiguateTest.cpp
- ForestTest.cpp
- GLRTest.cpp
- GrammarTest.cpp
- LRTableTest.cpp
TokenTest.cpp
)
@@ -24,8 +18,6 @@ clang_target_link_libraries(ClangPseudoTests
target_link_libraries(ClangPseudoTests
PRIVATE
clangPseudo
- clangPseudoCXX
- clangPseudoGrammar
LLVMTestingAnnotations
LLVMTestingSupport
)
diff --git a/clang-tools-extra/pseudo/unittests/CXXTest.cpp b/clang-tools-extra/pseudo/unittests/CXXTest.cpp
deleted file mode 100644
index 505f958ae7556..0000000000000
--- a/clang-tools-extra/pseudo/unittests/CXXTest.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//===--- CXXTest.cpp ------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/cxx/CXX.h"
-#include "gtest/gtest.h"
-
-namespace clang {
-namespace pseudo {
-namespace cxx {
-namespace {
-
-TEST(CXX, GeneratedEnums) {
- const auto &Lang = clang::pseudo::cxx::getLanguage();
- EXPECT_EQ("iteration-statement",
- Lang.G.symbolName(Symbol::iteration_statement));
- EXPECT_EQ("iteration-statement := DO statement WHILE ( expression ) ;",
- Lang.G.dumpRule(
- rule::iteration_statement::
- DO__statement__WHILE__L_PAREN__expression__R_PAREN__SEMI));
-}
-
-} // namespace
-} // namespace cxx
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp b/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
deleted file mode 100644
index 2f483bb090660..0000000000000
--- a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-//===--- DisambiguateTest.cpp ---------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Disambiguate.h"
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Token.h"
-#include "clang/Basic/TokenKinds.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-namespace {
-using testing::ElementsAre;
-using testing::Pair;
-using testing::UnorderedElementsAre;
-
-// Common disambiguation test fixture.
-// This is the ambiguous forest representing parses of 'a * b;'.
-class DisambiguateTest : public ::testing::Test {
-protected:
- // Greatly simplified C++ grammar.
- enum Symbol : SymbolID {
- Statement,
- Declarator,
- Expression,
- DeclSpecifier,
- Type,
- Template,
- };
- enum Rule : RuleID {
- /* LHS__RHS1_RHS2 means LHS := RHS1 RHS2 */
- Statement__DeclSpecifier_Declarator_Semi,
- Declarator__Star_Declarator,
- Declarator__Identifier,
- Statement__Expression_Semi,
- Expression__Expression_Star_Expression,
- Expression__Identifier,
- DeclSpecifier__Type,
- DeclSpecifier__Template,
- Type__Identifier,
- Template__Identifier,
- };
-
- ForestArena Arena;
- ForestNode &A = Arena.createTerminal(tok::identifier, 0);
- ForestNode &Star = Arena.createTerminal(tok::star, 1);
- ForestNode &B = Arena.createTerminal(tok::identifier, 2);
- ForestNode &Semi = Arena.createTerminal(tok::semi, 3);
-
- // Parse as multiplication expression.
- ForestNode &AExpr =
- Arena.createSequence(Expression, Expression__Identifier, &A);
- ForestNode &BExpr =
- Arena.createSequence(Expression, Expression__Identifier, &B);
- ForestNode &Expr =
- Arena.createSequence(Expression, Expression__Expression_Star_Expression,
- {&AExpr, &Star, &BExpr});
- ForestNode &ExprStmt = Arena.createSequence(
- Statement, Statement__Expression_Semi, {&Expr, &Semi});
- // Parse as declaration (`a` may be CTAD or not).
- ForestNode &AType =
- Arena.createSequence(DeclSpecifier, DeclSpecifier__Type,
- &Arena.createSequence(Type, Type__Identifier, &A));
- ForestNode &ATemplate = Arena.createSequence(
- DeclSpecifier, DeclSpecifier__Template,
- &Arena.createSequence(Template, Template__Identifier, &A));
- ForestNode &DeclSpec =
- Arena.createAmbiguous(DeclSpecifier, {&AType, &ATemplate});
- ForestNode &BDeclarator =
- Arena.createSequence(Declarator, Declarator__Identifier, &B);
- ForestNode &BPtr = Arena.createSequence(
- Declarator, Declarator__Star_Declarator, {&Star, &BDeclarator});
- ForestNode &DeclStmt =
- Arena.createSequence(Statement, Statement__DeclSpecifier_Declarator_Semi,
- {&DeclSpec, &Star, &BDeclarator});
- // Top-level ambiguity
- ForestNode &Stmt = Arena.createAmbiguous(Statement, {&ExprStmt, &DeclStmt});
-};
-
-TEST_F(DisambiguateTest, Remove) {
- Disambiguation D;
- D.try_emplace(&Stmt, 1); // statement is a declaration, not an expression
- D.try_emplace(&DeclSpec, 0); // a is a type, not a (CTAD) template
- ForestNode *Root = &Stmt;
- removeAmbiguities(Root, D);
-
- EXPECT_EQ(Root, &DeclStmt);
- EXPECT_THAT(DeclStmt.elements(), ElementsAre(&AType, &Star, &BDeclarator));
-}
-
-TEST_F(DisambiguateTest, DummyStrategy) {
- Disambiguation D = disambiguate(&Stmt, {});
- EXPECT_THAT(D, UnorderedElementsAre(Pair(&Stmt, 1), Pair(&DeclSpec, 1)));
-
- ForestNode *Root = &Stmt;
- removeAmbiguities(Root, D);
- EXPECT_EQ(Root, &DeclStmt);
- EXPECT_THAT(DeclStmt.elements(),
- ElementsAre(&ATemplate, &Star, &BDeclarator));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
deleted file mode 100644
index 36af896148209..0000000000000
--- a/clang-tools-extra/pseudo/unittests/ForestTest.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-//===--- ForestTest.cpp - Test Forest dump ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Forest.h"
-#include "clang-pseudo/Token.h"
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/StringRef.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-// FIXME: extract to a TestGrammar class to allow code sharing among tests.
-class ForestTest : public ::testing::Test {
-public:
- void build(llvm::StringRef BNF) {
- Diags.clear();
- G = Grammar::parseBNF(BNF, Diags);
- }
-
- SymbolID symbol(llvm::StringRef Name) const {
- for (unsigned I = 0; I < NumTerminals; ++I)
- if (G.table().Terminals[I] == Name)
- return tokenSymbol(static_cast<tok::TokenKind>(I));
- for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID)
- if (G.table().Nonterminals[ID].Name == Name)
- return ID;
- ADD_FAILURE() << "No such symbol found: " << Name;
- return 0;
- }
-
- RuleID ruleFor(llvm::StringRef NonterminalName) const {
- auto RuleRange = G.table().Nonterminals[symbol(NonterminalName)].RuleRange;
- if (RuleRange.End - RuleRange.Start == 1)
- return G.table().Nonterminals[symbol(NonterminalName)].RuleRange.Start;
- ADD_FAILURE() << "Expected a single rule for " << NonterminalName
- << ", but it has " << RuleRange.End - RuleRange.Start
- << " rule!\n";
- return 0;
- }
-
-protected:
- Grammar G;
- std::vector<std::string> Diags;
-};
-
-TEST_F(ForestTest, DumpBasic) {
- build(R"cpp(
- _ := add-expression EOF
- add-expression := id-expression + id-expression
- id-expression := IDENTIFIER
- )cpp");
- ASSERT_TRUE(Diags.empty());
- ForestArena Arena;
- const auto &TS =
- cook(lex("a + b", clang::LangOptions()), clang::LangOptions());
-
- auto T = Arena.createTerminals(TS);
- ASSERT_EQ(T.size(), 4u);
- const auto *Left = &Arena.createSequence(
- symbol("id-expression"), ruleFor("id-expression"), {&T.front()});
- const auto *Right = &Arena.createSequence(symbol("id-expression"),
- ruleFor("id-expression"), {&T[2]});
-
- const auto *Add =
- &Arena.createSequence(symbol("add-expression"), ruleFor("add-expression"),
- {Left, &T[1], Right});
- EXPECT_EQ(Add->dumpRecursive(G, true),
- "[ 0, end) add-expression := id-expression + id-expression\n"
- "[ 0, 1) ├─id-expression~IDENTIFIER := tok[0]\n"
- "[ 1, 2) ├─+ := tok[1]\n"
- "[ 2, end) └─id-expression~IDENTIFIER := tok[2]\n");
- EXPECT_EQ(Add->dumpRecursive(G, false),
- "[ 0, end) add-expression := id-expression + id-expression\n"
- "[ 0, 1) ├─id-expression := IDENTIFIER\n"
- "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
- "[ 1, 2) ├─+ := tok[1]\n"
- "[ 2, end) └─id-expression := IDENTIFIER\n"
- "[ 2, end) └─IDENTIFIER := tok[2]\n");
-}
-
-TEST_F(ForestTest, DumpAmbiguousAndRefs) {
- build(R"cpp(
- _ := type EOF
- type := class-type # rule 4
- type := enum-type # rule 5
- class-type := shared-type
- enum-type := shared-type
- shared-type := IDENTIFIER)cpp");
- ASSERT_TRUE(Diags.empty());
- ForestArena Arena;
- const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions());
-
- auto Terminals = Arena.createTerminals(TS);
- ASSERT_EQ(Terminals.size(), 2u);
-
- const auto *SharedType = &Arena.createSequence(
- symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()});
- const auto *ClassType = &Arena.createSequence(
- symbol("class-type"), ruleFor("class-type"), {SharedType});
- const auto *EnumType = &Arena.createSequence(
- symbol("enum-type"), ruleFor("enum-type"), {SharedType});
- const auto *Alternative1 =
- &Arena.createSequence(symbol("type"), /*RuleID=*/4, {ClassType});
- const auto *Alternative2 =
- &Arena.createSequence(symbol("type"), /*RuleID=*/5, {EnumType});
- const auto *Type =
- &Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2});
- EXPECT_EQ(Type->dumpRecursive(G),
- "[ 0, end) type := <ambiguous>\n"
- "[ 0, end) ├─type := class-type\n"
- "[ 0, end) │ └─class-type := shared-type\n"
- "[ 0, end) │ └─shared-type := IDENTIFIER #1\n"
- "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
- "[ 0, end) └─type := enum-type\n"
- "[ 0, end) └─enum-type := shared-type\n"
- "[ 0, end) └─shared-type =#1\n");
-}
-
-TEST_F(ForestTest, DumpAbbreviatedShared) {
- build(R"cpp(
- _ := A
- A := B
- B := *
- )cpp");
-
- ForestArena Arena;
- const auto *Star = &Arena.createTerminal(tok::star, 0);
-
- const auto *B = &Arena.createSequence(symbol("B"), ruleFor("B"), {Star});
- // We have two identical (but distinct) A nodes.
- // The GLR parser would never produce this, but it makes the example simpler.
- const auto *A1 = &Arena.createSequence(symbol("A"), ruleFor("A"), {B});
- const auto *A2 = &Arena.createSequence(symbol("A"), ruleFor("A"), {B});
- const auto *A = &Arena.createAmbiguous(symbol("A"), {A1, A2});
-
- // We must not abbreviate away shared nodes: if we show A~* there's no way to
- // show that the intermediate B node is shared between A1 and A2.
- EXPECT_EQ(A->dumpRecursive(G, /*Abbreviate=*/true),
- "[ 0, end) A := <ambiguous>\n"
- "[ 0, end) ├─A~B := * #1\n"
- "[ 0, end) │ └─* := tok[0]\n"
- "[ 0, end) └─A~B =#1\n");
-}
-
-TEST_F(ForestTest, Iteration) {
- // Z
- // / \
- // X Y
- // |\|
- // A B
- ForestArena Arena;
- const auto *A = &Arena.createTerminal(tok::identifier, 0);
- const auto *B = &Arena.createOpaque(1, 0);
- const auto *X = &Arena.createSequence(2, 1, {A, B});
- const auto *Y = &Arena.createSequence(2, 2, {B});
- const auto *Z = &Arena.createAmbiguous(2, {X, Y});
-
- std::vector<const ForestNode *> Nodes;
- for (const ForestNode &N : Z->descendants())
- Nodes.push_back(&N);
- EXPECT_THAT(Nodes, testing::UnorderedElementsAre(A, B, X, Y, Z));
-
- Nodes.clear();
- for (const ForestNode &N : X->descendants())
- Nodes.push_back(&N);
- EXPECT_THAT(Nodes, testing::UnorderedElementsAre(X, A, B));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
deleted file mode 100644
index f361fb78247ac..0000000000000
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ /dev/null
@@ -1,789 +0,0 @@
-//===--- GLRTest.cpp - Test the GLR parser ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Bracket.h"
-#include "clang-pseudo/Language.h"
-#include "clang-pseudo/Token.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/LangOptions.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <memory>
-
-namespace clang {
-namespace pseudo {
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const std::vector<const GSS::Node *> &Heads) {
- for (const auto *Head : Heads)
- OS << *Head << "\n";
- return OS;
-}
-
-namespace {
-
-using StateID = LRTable::StateID;
-using testing::AllOf;
-using testing::ElementsAre;
-using testing::IsEmpty;
-using testing::UnorderedElementsAre;
-
-MATCHER_P(state, StateID, "") { return arg->State == StateID; }
-MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
-MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
-MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
-
-testing::Matcher<const GSS::Node *>
-parents(llvm::ArrayRef<const GSS::Node *> Parents) {
- return testing::Property(&GSS::Node::parents,
- testing::UnorderedElementsAreArray(Parents));
-}
-
-Token::Index recoverBraces(Token::Index Begin, const TokenStream &Code) {
- EXPECT_GT(Begin, 0u);
- const Token &Left = Code.tokens()[Begin - 1];
- EXPECT_EQ(Left.Kind, tok::l_brace);
- if (const auto* Right = Left.pair()) {
- EXPECT_EQ(Right->Kind, tok::r_brace);
- return Code.index(*Right);
- }
- return Token::Invalid;
-}
-
-class GLRTest : public ::testing::Test {
-public:
- void build(llvm::StringRef GrammarBNF) {
- std::vector<std::string> Diags;
- TestLang.G = Grammar::parseBNF(GrammarBNF, Diags);
- }
-
- TokenStream emptyTokenStream() {
- TokenStream Empty;
- Empty.finalize();
- return Empty;
- }
-
- void buildGrammar(std::vector<std::string> Nonterminals,
- std::vector<std::string> Rules) {
- Nonterminals.push_back("_");
- llvm::sort(Nonterminals);
- Nonterminals.erase(std::unique(Nonterminals.begin(), Nonterminals.end()),
- Nonterminals.end());
- std::string FakeTestBNF;
- for (const auto &NT : Nonterminals)
- FakeTestBNF += llvm::formatv("{0} := {1}\n", "_", NT);
- FakeTestBNF += llvm::join(Rules, "\n");
- build(FakeTestBNF);
- }
-
- SymbolID id(llvm::StringRef Name) const {
- for (unsigned I = 0; I < NumTerminals; ++I)
- if (TestLang.G.table().Terminals[I] == Name)
- return tokenSymbol(static_cast<tok::TokenKind>(I));
- for (SymbolID ID = 0; ID < TestLang.G.table().Nonterminals.size(); ++ID)
- if (TestLang.G.table().Nonterminals[ID].Name == Name)
- return ID;
- ADD_FAILURE() << "No such symbol found: " << Name;
- return 0;
- }
- ExtensionID extensionID(llvm::StringRef AttrValueName) const {
- for (ExtensionID EID = 0; EID < TestLang.G.table().AttributeValues.size();
- ++EID)
- if (TestLang.G.table().AttributeValues[EID] == AttrValueName)
- return EID;
- ADD_FAILURE() << "No such attribute value found: " << AttrValueName;
- return 0;
- }
-
- RuleID ruleFor(llvm::StringRef NonterminalName) const {
- auto RuleRange =
- TestLang.G.table().Nonterminals[id(NonterminalName)].RuleRange;
- if (RuleRange.End - RuleRange.Start == 1)
- return TestLang.G.table()
- .Nonterminals[id(NonterminalName)]
- .RuleRange.Start;
- ADD_FAILURE() << "Expected a single rule for " << NonterminalName
- << ", but it has " << RuleRange.End - RuleRange.Start
- << " rule!\n";
- return 0;
- }
-
-protected:
- Language TestLang;
- ForestArena Arena;
- GSS GSStack;
-};
-
-TEST_F(GLRTest, ShiftMergingHeads) {
- // Given a test case where we have two heads 1, 2, 3 in the GSS, the heads 1,
- // 2 have shift actions to reach state 4, and the head 3 has a shift action to
- // reach state 5:
- // 0--1
- // └--2
- // └--3
- // After the shift action, the GSS (with new heads 4, 5) is:
- // 0---1---4
- // └---2---┘
- // └---3---5
- auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
- /*Parents=*/{GSSNode0});
- auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
- /*Parents=*/{GSSNode0});
- auto *GSSNode3 = GSStack.addNode(/*State=*/3, /*ForestNode=*/nullptr,
- /*Parents=*/{GSSNode0});
-
- buildGrammar({}, {}); // Create a fake empty grammar.
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, tokenSymbol(tok::semi)}] = StateID{4};
- B.Transition[{StateID{2}, tokenSymbol(tok::semi)}] = StateID{4};
- B.Transition[{StateID{3}, tokenSymbol(tok::semi)}] = StateID{5};
- TestLang.Table = std::move(B).build();
-
- ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
- std::vector<const GSS::Node *> NewHeads;
- glrShift({GSSNode1, GSSNode2, GSSNode3}, SemiTerminal,
- {emptyTokenStream(), Arena, GSStack}, TestLang, NewHeads);
-
- EXPECT_THAT(NewHeads,
- UnorderedElementsAre(AllOf(state(4), parsedSymbol(&SemiTerminal),
- parents({GSSNode1, GSSNode2})),
- AllOf(state(5), parsedSymbol(&SemiTerminal),
- parents({GSSNode3}))))
- << NewHeads;
-}
-
-TEST_F(GLRTest, ReduceConflictsSplitting) {
- // Before (splitting due to R/R conflict):
- // 0--1(IDENTIFIER)
- // After reducing 1 by `class-name := IDENTIFIER` and
- // `enum-name := IDENTIFIER`:
- // 0--2(class-name) // 2 is goto(0, class-name)
- // └--3(enum-name) // 3 is goto(0, enum-name)
- buildGrammar({"class-name", "enum-name"},
- {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{0}, id("class-name")}] = StateID{2};
- B.Transition[{StateID{0}, id("enum-name")}] = StateID{3};
- B.Reduce[StateID{1}].insert(ruleFor("class-name"));
- B.Reduce[StateID{1}].insert(ruleFor("enum-name"));
- TestLang.Table = std::move(B).build();
-
- const auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 =
- GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
-
- std::vector<const GSS::Node *> Heads = {GSSNode1};
- glrReduce(Heads, tokenSymbol(tok::eof),
- {emptyTokenStream(), Arena, GSStack}, TestLang);
- EXPECT_THAT(Heads, UnorderedElementsAre(
- GSSNode1,
- AllOf(state(2), parsedSymbolID(id("class-name")),
- parents({GSSNode0})),
- AllOf(state(3), parsedSymbolID(id("enum-name")),
- parents({GSSNode0}))))
- << Heads;
-}
-
-TEST_F(GLRTest, ReduceSplittingDueToMultipleBases) {
- // Before (splitting due to multiple bases):
- // 2(class-name)--4(*)
- // 3(enum-name)---┘
- // After reducing 4 by `ptr-operator := *`:
- // 2(class-name)--5(ptr-operator) // 5 is goto(2, ptr-operator)
- // 3(enum-name)---6(ptr-operator) // 6 is goto(3, ptr-operator)
- buildGrammar({"ptr-operator", "class-name", "enum-name"},
- {"ptr-operator := *"});
-
- auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
- auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
-
- const auto *GSSNode2 =
- GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
- const auto *GSSNode3 =
- GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
- const auto *GSSNode4 = GSStack.addNode(
- /*State=*/4, &Arena.createTerminal(tok::star, /*TokenIndex=*/1),
- /*Parents=*/{GSSNode2, GSSNode3});
-
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{2}, id("ptr-operator")}] = StateID{5};
- B.Transition[{StateID{3}, id("ptr-operator")}] = StateID{6};
- B.Reduce[StateID{4}].insert(ruleFor("ptr-operator"));
- TestLang.Table = std::move(B).build();
-
- std::vector<const GSS::Node *> Heads = {GSSNode4};
- glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
- TestLang);
-
- EXPECT_THAT(Heads, UnorderedElementsAre(
- GSSNode4,
- AllOf(state(5), parsedSymbolID(id("ptr-operator")),
- parents({GSSNode2})),
- AllOf(state(6), parsedSymbolID(id("ptr-operator")),
- parents({GSSNode3}))))
- << Heads;
- // Verify that the payload of the two new heads is shared, only a single
- // ptr-operator node is created in the forest.
- EXPECT_EQ(Heads[1]->Payload, Heads[2]->Payload);
-}
-
-TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
- // Before (joining due to same goto state, multiple bases):
- // 0--1(cv-qualifier)--3(class-name)
- // └--2(cv-qualifier)--4(enum-name)
- // After reducing 3 by `type-name := class-name` and
- // 4 by `type-name := enum-name`:
- // 0--1(cv-qualifier)--5(type-name) // 5 is goto(1, type-name) and
- // └--2(cv-qualifier)--┘ // goto(2, type-name)
- buildGrammar({"type-name", "class-name", "enum-name", "cv-qualifier"},
- {"type-name := class-name", "type-name := enum-name"});
-
- auto *CVQualifierNode =
- &Arena.createOpaque(id("cv-qualifier"), /*TokenIndex=*/0);
- auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1);
- auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1);
-
- const auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 = GSStack.addNode(
- /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
- const auto *GSSNode2 = GSStack.addNode(
- /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
- const auto *GSSNode3 = GSStack.addNode(
- /*State=*/3, /*ForestNode=*/ClassNameNode,
- /*Parents=*/{GSSNode1});
- const auto *GSSNode4 =
- GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
- /*Parents=*/{GSSNode2});
-
- // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("type-name")}] = StateID{5};
- B.Transition[{StateID{2}, id("type-name")}] = StateID{5};
- B.Reduce[StateID{3}].insert(/* type-name := class-name */ RuleID{0});
- B.Reduce[StateID{4}].insert(/* type-name := enum-name */ RuleID{1});
- TestLang.Table = std::move(B).build();
-
- std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
- glrReduce(Heads, tokenSymbol(tok::eof), {emptyTokenStream(), Arena, GSStack},
- TestLang);
-
- // Verify that the stack heads are joint at state 5 after reduces.
- EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
- AllOf(state(5),
- parsedSymbolID(id("type-name")),
- parents({GSSNode1, GSSNode2}))))
- << Heads;
- // Verify that we create an ambiguous ForestNode of two parses of `type-name`.
- EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
- "[ 1, end) type-name := <ambiguous>\n"
- "[ 1, end) ├─type-name := class-name\n"
- "[ 1, end) │ └─class-name := <opaque>\n"
- "[ 1, end) └─type-name := enum-name\n"
- "[ 1, end) └─enum-name := <opaque>\n");
-}
-
-TEST_F(GLRTest, ReduceJoiningWithSameBase) {
- // Before (joining due to same goto state, the same base):
- // 0--1(class-name)--3(*)
- // └--2(enum-name)--4(*)
- // After reducing 3 by `pointer := class-name *` and
- // 2 by `pointer := enum-name *`:
- // 0--5(pointer) // 5 is goto(0, pointer)
- buildGrammar({"pointer", "class-name", "enum-name"},
- {"pointer := class-name *", "pointer := enum-name *"});
-
- auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
- auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
- auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1);
-
- const auto *GSSNode0 =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 =
- GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode,
- /*Parents=*/{GSSNode0});
- const auto *GSSNode2 =
- GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode,
- /*Parents=*/{GSSNode0});
- const auto *GSSNode3 =
- GSStack.addNode(/*State=*/3, /*ForestNode=*/StartTerminal,
- /*Parents=*/{GSSNode1});
- const auto *GSSNode4 =
- GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
- /*Parents=*/{GSSNode2});
-
- // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{0}, id("pointer")}] = StateID{5};
- B.Reduce[StateID{3}].insert(/* pointer := class-name */ RuleID{0});
- B.Reduce[StateID{4}].insert(/* pointer := enum-name */ RuleID{1});
- TestLang.Table = std::move(B).build();
-
- std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
- glrReduce(Heads, tokenSymbol(tok::eof),
- {emptyTokenStream(), Arena, GSStack}, TestLang);
-
- EXPECT_THAT(
- Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
- AllOf(state(5), parsedSymbolID(id("pointer")),
- parents({GSSNode0}))))
- << Heads;
- EXPECT_EQ(Heads.back()->Payload->dumpRecursive(TestLang.G),
- "[ 0, end) pointer := <ambiguous>\n"
- "[ 0, end) ├─pointer := class-name *\n"
- "[ 0, 1) │ ├─class-name := <opaque>\n"
- "[ 1, end) │ └─* := tok[1]\n"
- "[ 0, end) └─pointer := enum-name *\n"
- "[ 0, 1) ├─enum-name := <opaque>\n"
- "[ 1, end) └─* := tok[1]\n");
-}
-
-TEST_F(GLRTest, ReduceLookahead) {
- // A term can be followed by +, but not by -.
- buildGrammar({"sum", "term"}, {"expr := term + term", "term := IDENTIFIER"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{0}, id("term")}] = StateID{2};
- B.Reduce[StateID{1}].insert(RuleID{0});
- TestLang.Table = std::move(B).build();
-
- auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0);
-
- const auto *Root =
- GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
- const auto *GSSNode1 =
- GSStack.addNode(/*State=*/1, /*ForestNode=*/Identifier, {Root});
-
- // When the lookahead is +, reduce is performed.
- std::vector<const GSS::Node *> Heads = {GSSNode1};
- glrReduce(Heads, tokenSymbol(tok::plus), {emptyTokenStream(), Arena, GSStack},
- TestLang);
- EXPECT_THAT(Heads,
- ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")),
- parents(Root))));
-
- // When the lookahead is -, reduce is not performed.
- Heads = {GSSNode1};
- glrReduce(Heads, tokenSymbol(tok::minus),
- {emptyTokenStream(), Arena, GSStack}, TestLang);
- EXPECT_THAT(Heads, ElementsAre(GSSNode1));
-}
-
-TEST_F(GLRTest, Recover) {
- // Recovery while parsing "word" inside braces.
- // Before:
- // 0--1({)--2(?)
- // After recovering a `word` at state 1:
- // 0--3(word) // 3 is goto(1, word)
- buildGrammar({"word", "top"}, {"top := { word [recover=Braces] }"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("word")}] = StateID{3};
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
- TestLang.Table = std::move(B).build();
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
-
- auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
- auto *Question1 = &Arena.createTerminal(tok::question, 1);
- const auto *Root = GSStack.addNode(0, nullptr, {});
- const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
- const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
-
- // Need a token stream with paired braces so the strategy works.
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
- pairBrackets(Tokens);
- std::vector<const GSS::Node *> NewHeads;
-
- unsigned TokenIndex = 2;
- glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
- NewHeads);
- EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
- EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
- parents({OpenedBraces}), start(1u))));
- EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
-
- // Test recovery failure: omit closing brace so strategy fails
- TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
- pairBrackets(NoRBrace);
- NewHeads.clear();
- TokenIndex = 2;
- glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang,
- NewHeads);
- EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
- EXPECT_THAT(NewHeads, IsEmpty());
-}
-
-TEST_F(GLRTest, RecoverRightmost) {
- // In a nested block structure, we recover at the innermost possible block.
- // Before:
- // 0--1({)--1({)--1({)
- // After recovering a `block` at inside the second braces:
- // 0--1({)--2(body) // 2 is goto(1, body)
- buildGrammar({"body", "top"}, {"top := { body [recover=Braces] }"});
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("body")}] = StateID{2};
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("body")}});
- TestLang.Table = std::move(B).build();
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
-
- clang::LangOptions LOptions;
- // Innermost brace is unmatched, to test fallback to next brace.
- TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
- Tokens.tokens()[0].Pair = 5;
- Tokens.tokens()[1].Pair = 4;
- Tokens.tokens()[4].Pair = 1;
- Tokens.tokens()[5].Pair = 0;
-
- auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
- auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
- auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
- const auto *Root = GSStack.addNode(0, nullptr, {});
- const auto *In1 = GSStack.addNode(1, Brace1, {Root});
- const auto *In2 = GSStack.addNode(1, Brace2, {In1});
- const auto *In3 = GSStack.addNode(1, Brace3, {In2});
-
- unsigned TokenIndex = 3;
- std::vector<const GSS::Node *> NewHeads;
- glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads);
- EXPECT_EQ(TokenIndex, 5u);
- EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
- parents({In2}), start(2u))));
-}
-
-TEST_F(GLRTest, RecoverAlternatives) {
- // Recovery inside braces with multiple equally good options
- // Before:
- // 0--1({)
- // After recovering either `word` or `number` inside the braces:
- // 0--1({)--2(word) // 2 is goto(1, word)
- // └--3(number) // 3 is goto(1, number)
- buildGrammar({"number", "word", "top"},
- {
- "top := { number [recover=Braces] }",
- "top := { word [recover=Braces] }",
- });
- LRTable::Builder B(TestLang.G);
- B.Transition[{StateID{1}, id("number")}] = StateID{2};
- B.Transition[{StateID{1}, id("word")}] = StateID{3};
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("number")}});
- B.Recoveries.push_back({StateID{1}, {extensionID("Braces"), id("word")}});
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
- TestLang.Table = std::move(B).build();
- auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
- const auto *Root = GSStack.addNode(0, nullptr, {});
- const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
-
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
- pairBrackets(Tokens);
- std::vector<const GSS::Node *> NewHeads;
- unsigned TokenIndex = 1;
-
- glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
- NewHeads);
- EXPECT_EQ(TokenIndex, 2u);
- EXPECT_THAT(NewHeads,
- UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
- parents({OpenedBraces}), start(1u)),
- AllOf(state(3), parsedSymbolID(id("word")),
- parents({OpenedBraces}), start(1u))));
-}
-
-TEST_F(GLRTest, PerfectForestNodeSharing) {
- // Run the GLR on a simple grammar and test that we build exactly one forest
- // node per (SymbolID, token range).
-
- // This is a grmammar where the original parsing-stack-based forest node
- // sharing approach will fail. In its LR0 graph, it has two states containing
- // item `expr := • IDENTIFIER`, and both have different goto states on the
- // nonterminal `expr`.
- build(R"bnf(
- _ := test EOF
-
- test := { expr
- test := { IDENTIFIER
- test := left-paren expr
- left-paren := {
- expr := IDENTIFIER
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- clang::LangOptions LOptions;
- const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
- // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
- // in the forest, see the `#1` and `=#1` in the dump string.
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) test := <ambiguous>\n"
- "[ 0, end) ├─test := { expr\n"
- "[ 0, 1) │ ├─{ := tok[0]\n"
- "[ 1, end) │ └─expr := IDENTIFIER #1\n"
- "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
- "[ 0, end) ├─test := { IDENTIFIER\n"
- "[ 0, 1) │ ├─{ := tok[0]\n"
- "[ 1, end) │ └─IDENTIFIER := tok[1]\n"
- "[ 0, end) └─test := left-paren expr\n"
- "[ 0, 1) ├─left-paren := {\n"
- "[ 0, 1) │ └─{ := tok[0]\n"
- "[ 1, end) └─expr =#1\n");
-}
-
-TEST_F(GLRTest, GLRReduceOrder) {
- // Given the following grammar, and the input `IDENTIFIER`, reductions should
- // be performed in the following order:
- // 1. foo := IDENTIFIER
- // 2. { test := IDENTIFIER, test := foo }
- // foo should be reduced first, so that in step 2 we have completed reduces
- // for test, and form an ambiguous forest node.
- build(R"bnf(
- _ := test EOF
-
- test := IDENTIFIER
- test := foo
- foo := IDENTIFIER
- )bnf");
- clang::LangOptions LOptions;
- const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
- TestLang.Table = LRTable::buildSLR(TestLang.G);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) test := <ambiguous>\n"
- "[ 0, end) ├─test := IDENTIFIER\n"
- "[ 0, end) │ └─IDENTIFIER := tok[0]\n"
- "[ 0, end) └─test := foo\n"
- "[ 0, end) └─foo := IDENTIFIER\n"
- "[ 0, end) └─IDENTIFIER := tok[0]\n");
-}
-
-TEST_F(GLRTest, RecoveryEndToEnd) {
- // Simple example of brace-based recovery showing:
- // - recovered region includes tokens both ahead of and behind the cursor
- // - multiple possible recovery rules
- // - recovery from outer scopes is rejected
- build(R"bnf(
- _ := block EOF
-
- block := { block [recover=Braces] }
- block := { numbers [recover=Braces] }
- numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(extensionID("Braces"), recoverBraces);
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
- pairBrackets(Tokens);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("block"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) block := { block [recover=Braces] }\n"
- "[ 0, 1) ├─{ := tok[0]\n"
- "[ 1, 5) ├─block := <ambiguous>\n"
- "[ 1, 5) │ ├─block := { block [recover=Braces] }\n"
- "[ 1, 2) │ │ ├─{ := tok[1]\n"
- "[ 2, 4) │ │ ├─block := <opaque>\n"
- "[ 4, 5) │ │ └─} := tok[4]\n"
- "[ 1, 5) │ └─block := { numbers [recover=Braces] }\n"
- "[ 1, 2) │ ├─{ := tok[1]\n"
- "[ 2, 4) │ ├─numbers := <opaque>\n"
- "[ 4, 5) │ └─} := tok[4]\n"
- "[ 5, end) └─} := tok[5]\n");
-}
-
-TEST_F(GLRTest, RecoverTerminal) {
- build(R"bnf(
- _ := stmt EOF
-
- stmt := IDENTIFIER ; [recover=Skip]
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("Skip"),
- [](Token::Index Start, const TokenStream &) { return Start; });
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("foo", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("stmt"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) stmt := IDENTIFIER ; [recover=Skip]\n"
- "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
- "[ 1, end) └─; := <opaque>\n");
-}
-
-TEST_F(GLRTest, RecoverUnrestrictedReduce) {
- // Here, ! is not in any rule and therefore not in the follow set of `word`.
- // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
-
- build(R"bnf(
- _ := sentence EOF
-
- word := IDENTIFIER
- sentence := word word [recover=AcceptAnyTokenInstead]
- )bnf");
-
- clang::LangOptions LOptions;
- const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("AcceptAnyTokenInstead"),
- [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
- "[ 0, 1) ├─word := IDENTIFIER\n"
- "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
- "[ 1, end) └─word := <opaque>\n");
-}
-
-TEST_F(GLRTest, RecoveryFromStartOfInput) {
- build(R"bnf(
- _ := start [recover=Fallback] EOF
-
- start := IDENTIFIER
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- bool fallback_recovered = false;
- auto fallback = [&](Token::Index Start, const TokenStream & Code) {
- fallback_recovered = true;
- return Code.tokens().size();
- };
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("Fallback"),
- fallback);
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("?", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("start"), TestLang);
- EXPECT_TRUE(fallback_recovered);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) start := <opaque>\n");
-}
-
-TEST_F(GLRTest, RepeatedRecovery) {
- // We require multiple steps of recovery at eof and then a reduction in order
- // to successfully parse.
- build(R"bnf(
- _ := function EOF
- # FIXME: this forces EOF to be in follow(signature).
- # Remove it once we use unconstrained reduction for recovery.
- _ := signature EOF
-
- function := signature body [recover=Skip]
- signature := IDENTIFIER params [recover=Skip]
- params := ( )
- body := { }
- )bnf");
- TestLang.Table = LRTable::buildSLR(TestLang.G);
- TestLang.RecoveryStrategies.try_emplace(
- extensionID("Skip"),
- [](Token::Index Start, const TokenStream &) { return Start; });
- clang::LangOptions LOptions;
- TokenStream Tokens = cook(lex("main", LOptions), LOptions);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("function"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) function := signature body [recover=Skip]\n"
- "[ 0, 1) ├─signature := IDENTIFIER params [recover=Skip]\n"
- "[ 0, 1) │ ├─IDENTIFIER := tok[0]\n"
- "[ 1, 1) │ └─params := <opaque>\n"
- "[ 1, end) └─body := <opaque>\n");
-}
-
-TEST_F(GLRTest, NoExplicitAccept) {
- build(R"bnf(
- _ := test EOF
-
- test := IDENTIFIER test
- test := IDENTIFIER
- )bnf");
- clang::LangOptions LOptions;
- // Given the following input, and the grammar above, we perform two reductions
- // of the nonterminal `test` when the next token is `eof`, verify that the
- // parser stops at the right state.
- const TokenStream &Tokens = cook(lex("id id", LOptions), LOptions);
- TestLang.Table = LRTable::buildSLR(TestLang.G);
-
- const ForestNode &Parsed =
- glrParse({Tokens, Arena, GSStack}, id("test"), TestLang);
- EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
- "[ 0, end) test := IDENTIFIER test\n"
- "[ 0, 1) ├─IDENTIFIER := tok[0]\n"
- "[ 1, end) └─test := IDENTIFIER\n"
- "[ 1, end) └─IDENTIFIER := tok[1]\n");
-}
-
-TEST_F(GLRTest, GuardExtension) {
- build(R"bnf(
- _ := start EOF
-
- start := IDENTIFIER [guard]
- )bnf");
- TestLang.Guards.try_emplace(
- ruleFor("start"), [&](const GuardParams &P) {
- assert(P.RHS.size() == 1 &&
- P.RHS.front()->symbol() ==
- tokenSymbol(clang::tok::identifier));
- return P.Tokens.tokens()[P.RHS.front()->startTokenIndex()]
- .text() == "test";
- });
- clang::LangOptions LOptions;
- TestLang.Table = LRTable::buildSLR(TestLang.G);
-
- std::string Input = "test";
- const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
- EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
- .dumpRecursive(TestLang.G),
- "[ 0, end) start := IDENTIFIER [guard]\n"
- "[ 0, end) └─IDENTIFIER := tok[0]\n");
-
- Input = "notest";
- const TokenStream &Failed = cook(lex(Input, LOptions), LOptions);
- EXPECT_EQ(glrParse({Failed, Arena, GSStack}, id("start"), TestLang)
- .dumpRecursive(TestLang.G),
- "[ 0, end) start := <opaque>\n");
-}
-
-TEST(GSSTest, GC) {
- // ┌-A-┬-AB
- // ├-B-┘
- // Root-+-C
- // ├-D
- // └-E
- GSS GSStack;
- auto *Root = GSStack.addNode(0, nullptr, {});
- auto *A = GSStack.addNode(0, nullptr, {Root});
- auto *B = GSStack.addNode(0, nullptr, {Root});
- auto *C = GSStack.addNode(0, nullptr, {Root});
- auto *D = GSStack.addNode(0, nullptr, {Root});
- auto *AB = GSStack.addNode(0, nullptr, {A, B});
-
- EXPECT_EQ(1u, GSStack.gc({AB, C})) << "D is destroyed";
- EXPECT_EQ(0u, GSStack.gc({AB, C})) << "D is already gone";
- auto *E = GSStack.addNode(0, nullptr, {Root});
- EXPECT_EQ(D, E) << "Storage of GCed node D is reused for E";
- EXPECT_EQ(3u, GSStack.gc({A, E})) << "Destroys B, AB, C";
- EXPECT_EQ(1u, GSStack.gc({E})) << "Destroys A";
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
deleted file mode 100644
index 6b6b47b8a2dbe..0000000000000
--- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//===--- GrammarTest.cpp - grammar tests -----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/Grammar.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <memory>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-using testing::AllOf;
-using testing::ElementsAre;
-using testing::IsEmpty;
-using testing::Pair;
-using testing::UnorderedElementsAre;
-
-MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
-template <typename... T> testing::Matcher<const Rule &> Sequence(T... IDs) {
- return testing::Property(&Rule::seq, ElementsAre(IDs...));
-}
-
-class GrammarTest : public ::testing::Test {
-public:
- void build(llvm::StringRef BNF) {
- Diags.clear();
- G = Grammar::parseBNF(BNF, Diags);
- }
-
- SymbolID id(llvm::StringRef Name) const {
- for (unsigned I = 0; I < NumTerminals; ++I)
- if (G.table().Terminals[I] == Name)
- return tokenSymbol(static_cast<tok::TokenKind>(I));
- for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID)
- if (G.table().Nonterminals[ID].Name == Name)
- return ID;
- ADD_FAILURE() << "No such symbol found: " << Name;
- return 0;
- }
-
- RuleID ruleFor(llvm::StringRef NonterminalName) const {
- auto RuleRange = G.table().Nonterminals[id(NonterminalName)].RuleRange;
- if (RuleRange.End - RuleRange.Start == 1)
- return G.table().Nonterminals[id(NonterminalName)].RuleRange.Start;
- ADD_FAILURE() << "Expected a single rule for " << NonterminalName
- << ", but it has " << RuleRange.End - RuleRange.Start
- << " rule!\n";
- return 0;
- }
-
-protected:
- Grammar G;
- std::vector<std::string> Diags;
-};
-
-TEST_F(GrammarTest, Basic) {
- build("_ := IDENTIFIER + _ # comment");
- EXPECT_THAT(Diags, IsEmpty());
-
- auto ExpectedRule =
- AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_")));
- EXPECT_EQ(G.symbolName(id("_")), "_");
- EXPECT_THAT(G.rulesFor(id("_")), UnorderedElementsAre(ExpectedRule));
- const auto &Rule = G.lookupRule(/*RID=*/0);
- EXPECT_THAT(Rule, ExpectedRule);
- EXPECT_THAT(G.symbolName(Rule.seq()[0]), "IDENTIFIER");
- EXPECT_THAT(G.symbolName(Rule.seq()[1]), "+");
- EXPECT_THAT(G.symbolName(Rule.seq()[2]), "_");
-}
-
-TEST_F(GrammarTest, EliminatedOptional) {
- build("_ := CONST_opt INT ;_opt");
- EXPECT_THAT(Diags, IsEmpty());
- EXPECT_THAT(G.table().Rules,
- UnorderedElementsAre(Sequence(id("INT")),
- Sequence(id("CONST"), id("INT")),
- Sequence(id("CONST"), id("INT"), id(";")),
- Sequence(id("INT"), id(";"))));
-}
-
-TEST_F(GrammarTest, RuleIDSorted) {
- build(R"bnf(
- _ := x
-
- x := y
- y := z
- z := IDENTIFIER
- )bnf");
- ASSERT_TRUE(Diags.empty());
-
- EXPECT_LT(ruleFor("z"), ruleFor("y"));
- EXPECT_LT(ruleFor("y"), ruleFor("x"));
- EXPECT_LT(ruleFor("x"), ruleFor("_"));
-}
-
-TEST_F(GrammarTest, Annotation) {
- build(R"bnf(
- _ := x
- x := IDENTIFIER [guard]
- )bnf");
- ASSERT_THAT(Diags, IsEmpty());
- EXPECT_FALSE(G.lookupRule(ruleFor("_")).Guarded);
- EXPECT_TRUE(G.lookupRule(ruleFor("x")).Guarded);
-}
-
-TEST_F(GrammarTest, Diagnostics) {
- build(R"cpp(
- _ := ,_opt
- _ := undefined-sym
- null :=
- _ := IDENFIFIE # a typo of the terminal IDENTIFIER
-
- invalid
- # cycle
- a := b
- b := a
-
- _ := IDENTIFIER [unknown=value]
- )cpp");
-
- EXPECT_EQ(G.underscore(), id("_"));
- EXPECT_THAT(Diags, UnorderedElementsAre(
- "Rule '_ := ,_opt' has a nullable RHS",
- "Rule 'null := ' has a nullable RHS",
- "No rules for nonterminal: undefined-sym",
- "Failed to parse 'invalid': no separator :=",
- "Token-like name IDENFIFIE is used as a nonterminal",
- "No rules for nonterminal: IDENFIFIE",
- "The grammar contains a cycle involving symbol a",
- "Unknown attribute 'unknown'"));
-}
-
-TEST_F(GrammarTest, DuplicatedDiagnostics) {
- build(R"cpp(
- _ := test
-
- test := INT
- test := DOUBLE
- test := INT
- )cpp");
-
- EXPECT_THAT(Diags, UnorderedElementsAre("Duplicate rule: `test := INT`"));
-}
-
-TEST_F(GrammarTest, FirstAndFollowSets) {
- build(
- R"bnf(
-_ := expr
-expr := expr - term
-expr := term
-term := IDENTIFIER
-term := ( expr )
-)bnf");
- ASSERT_TRUE(Diags.empty());
- auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
- std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
- for (SymbolID ID = 0; ID < Input.size(); ++ID)
- Sets.emplace_back(ID, std::move(Input[ID]));
- return Sets;
- };
-
- EXPECT_THAT(
- ToPairs(firstSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
- Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
- Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
- EXPECT_THAT(
- ToPairs(followSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("EOF"))),
- Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
- Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
-
- build(R"bnf(
-# A simplified C++ decl-specifier-seq.
-_ := decl-specifier-seq
-decl-specifier-seq := decl-specifier decl-specifier-seq
-decl-specifier-seq := decl-specifier
-decl-specifier := simple-type-specifier
-decl-specifier := INLINE
-simple-type-specifier := INT
- )bnf");
- ASSERT_TRUE(Diags.empty());
- EXPECT_THAT(
- ToPairs(firstSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
- Pair(id("decl-specifier-seq"),
- UnorderedElementsAre(id("INLINE"), id("INT"))),
- Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
- Pair(id("decl-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT")))));
- EXPECT_THAT(
- ToPairs(followSets(G)),
- UnorderedElementsAre(
- Pair(id("_"), UnorderedElementsAre(id("EOF"))),
- Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
- Pair(id("decl-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
- Pair(id("simple-type-specifier"),
- UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp b/clang-tools-extra/pseudo/unittests/LRTableTest.cpp
deleted file mode 100644
index 9c9f18e03a3d4..0000000000000
--- a/clang-tools-extra/pseudo/unittests/LRTableTest.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-//===--- LRTableTest.cpp - ---------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/grammar/LRTable.h"
-#include "clang-pseudo/grammar/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/Testing/Support/SupportHelpers.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-using llvm::ValueIs;
-using testing::ElementsAre;
-using StateID = LRTable::StateID;
-
-TEST(LRTable, Builder) {
- std::vector<std::string> GrammarDiags;
- Grammar G = Grammar::parseBNF(R"bnf(
- _ := expr # rule 0
- expr := term # rule 1
- expr := expr + term # rule 2
- term := IDENTIFIER # rule 3
- )bnf",
- GrammarDiags);
- EXPECT_THAT(GrammarDiags, testing::IsEmpty());
-
- SymbolID Term = *G.findNonterminal("term");
- SymbolID Eof = tokenSymbol(tok::eof);
- SymbolID Identifier = tokenSymbol(tok::identifier);
- SymbolID Plus = tokenSymbol(tok::plus);
-
- LRTable::Builder B(G);
- // eof IDENT term
- // +-------+----+-------+------+
- // |state0 | | s0 | |
- // |state1 | | | g3 |
- // |state2 | | | |
- // +-------+----+-------+------+-------
- B.Transition[{StateID{0}, Identifier}] = StateID{0};
- B.Transition[{StateID{1}, Term}] = StateID{3};
- B.Reduce[StateID{0}].insert(RuleID{0});
- B.Reduce[StateID{1}].insert(RuleID{2});
- B.Reduce[StateID{2}].insert(RuleID{1});
- LRTable T = std::move(B).build();
-
- EXPECT_EQ(T.getShiftState(0, Eof), std::nullopt);
- EXPECT_THAT(T.getShiftState(0, Identifier), ValueIs(0));
- EXPECT_THAT(T.getReduceRules(0), ElementsAre(0));
-
- EXPECT_EQ(T.getShiftState(1, Eof), std::nullopt);
- EXPECT_EQ(T.getShiftState(1, Identifier), std::nullopt);
- EXPECT_THAT(T.getGoToState(1, Term), ValueIs(3));
- EXPECT_THAT(T.getReduceRules(1), ElementsAre(2));
-
- // Verify the behavior for other non-available-actions terminals.
- SymbolID Int = tokenSymbol(tok::kw_int);
- EXPECT_EQ(T.getShiftState(2, Int), std::nullopt);
-
- // Check follow sets.
- EXPECT_TRUE(T.canFollow(Term, Plus));
- EXPECT_TRUE(T.canFollow(Term, Eof));
- EXPECT_FALSE(T.canFollow(Term, Int));
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace clang
More information about the cfe-commits
mailing list