[clang] 20e05b9 - [syntax][pseudo] Add Grammar for the clang pseudo-parser
Haojian Wu via cfe-commits
cfe-commits at lists.llvm.org
Thu Feb 3 02:28:51 PST 2022
Author: Haojian Wu
Date: 2022-02-03T11:28:27+01:00
New Revision: 20e05b9f0ebea35076b96c89257becd35d6de859
URL: https://github.com/llvm/llvm-project/commit/20e05b9f0ebea35076b96c89257becd35d6de859
DIFF: https://github.com/llvm/llvm-project/commit/20e05b9f0ebea35076b96c89257becd35d6de859.diff
LOG: [syntax][pseudo] Add Grammar for the clang pseudo-parser
This patch introduces the Grammar class, which is a critical piece for constructing
a table-based parser.
As the first patch, the scope is limited to:
- define base types (symbol, rules) of modeling the grammar
- construct Grammar by parsing the BNF file (annotations are excluded for now)
Differential Revision: https://reviews.llvm.org/D114790
Added:
clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
Modified:
clang/lib/Tooling/Syntax/CMakeLists.txt
clang/unittests/Tooling/Syntax/CMakeLists.txt
Removed:
################################################################################
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
new file mode 100644
index 0000000000000..80db9f268ff13
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -0,0 +1,170 @@
+//===--- Grammar.h - grammar used by clang pseudo parser --------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines base structures for parsing & modeling a grammar for a
+// programming language:
+//
+// # This is a fake C++ BNF grammar
+// _ := translation-unit
+// translation-unit := declaration-seq_opt
+// declaration-seq := declaration
+// declaration-seq := declaration-seq declaration
+//
+// A grammar formally describes a language, and it is constructed by a set of
+// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
+// non-terminal or terminal, identified by a SymbolID.
+//
+// Notions about the BNF grammar:
+// - "_" is the augmented symbol, formed by start symbols.
+// - single-line comment is supported, starting with a #
+// - A rule describes how a nonterminal (left side of :=) is constructed, and
+// it is *per line* in the grammar file
+// - Terminals (also called tokens) correspond to the clang::TokenKind; they
+// are written in the grammar like "IDENTIFIER", "USING", "+"
+// - Nonterminals are specified with "lower-case" names in the grammar; they
+// shouldn't be nullable (has an empty sequence)
+// - optional symbols are supported (specified with a _opt suffix), and they
+// will be eliminated during the grammar parsing stage
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+#define LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+// A SymbolID uniquely identifies a terminal/non-terminal symbol in a grammar.
+// Non-terminal IDs are indexes into a table of non-terminal symbols.
+// Terminal IDs correspond to the clang TokenKind enum.
+using SymbolID = uint16_t;
+// SymbolID is only 12 bits wide.
+// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
+static constexpr uint16_t SymbolBits = 12;
+static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
+// SymbolIDs with the top bit set are tokens/terminals.
+static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
+inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
+inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
+// The terminals are always the clang tok::TokenKind (not all are used).
+inline tok::TokenKind symbolToToken(SymbolID SID) {
+  assert(isToken(SID));
+  // Clear the terminal flag; what remains is the TokenKind value.
+  const SymbolID Kind = SID & ~TokenFlag;
+  assert(Kind < NumTerminals);
+  return static_cast<tok::TokenKind>(Kind);
+}
+inline SymbolID tokenSymbol(tok::TokenKind TK) {
+ return TokenFlag | static_cast<SymbolID>(TK);
+}
+
+// A RuleID uniquely identifies a production rule in a grammar.
+// It is an index into a table of rules.
+using RuleID = uint16_t;
+// There are maximum 2^12 rules.
+static constexpr unsigned RuleBits = 12;
+
+// Represent a production rule in the grammar, e.g.
+// expression := a b c
+// ^Target ^Sequence
+struct Rule {
+  // Builds the rule `Target := Seq...`. Seq must hold at most MaxElements
+  // symbols.
+  Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
+
+  // The sequence length is stored in a SizeBits-wide bit-field, so in theory
+  // it can be at most 2^SizeBits - 1 symbols long. We're stricter in order to
+  // reduce the size: the max length is limited to 9 (this is the longest
+  // sequence in the cxx grammar).
+  static constexpr unsigned SizeBits = 4;
+  static constexpr unsigned MaxElements = 9;
+  // Strictly less: a length of exactly 2^SizeBits would not fit into Size.
+  static_assert(MaxElements < (1u << SizeBits), "Exceeds the maximum limit");
+  static_assert(SizeBits + SymbolBits <= 16,
+                "Must be able to store symbol ID + size efficiently");
+
+  // 16 bits for target symbol and size of sequence:
+  // SymbolID : 12 | Size : 4
+  SymbolID Target : SymbolBits;
+  uint8_t Size : SizeBits; // Size of the Sequence
+  // Storage for the right-hand side; seq() exposes the first Size entries.
+  SymbolID Sequence[MaxElements];
+
+  // Returns the right-hand side of the rule as a view over Sequence.
+  llvm::ArrayRef<SymbolID> seq() const {
+    return llvm::ArrayRef<SymbolID>(Sequence, Size);
+  }
+  // Two rules are equal when both the target and the sequence match.
+  friend bool operator==(const Rule &L, const Rule &R) {
+    return L.Target == R.Target && L.seq() == R.seq();
+  }
+};
+
+struct GrammarTable;
+
+// Grammar that describes a programming language, e.g. C++. It represents the
+// contents of the specified grammar.
+// It is a building block for constructing a table-based parser.
+class Grammar {
+public:
+ // Takes ownership of the table T that backs this grammar.
+ explicit Grammar(std::unique_ptr<GrammarTable> T) : T(std::move(T)) {}
+
+ // Parses grammar from a BNF file.
+ // Diagnostics emitted during parsing are stored in Diags.
+ static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF,
+ std::vector<std::string> &Diags);
+
+ // Returns all rules of the given non-terminal symbol.
+ llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
+ // Returns the rule with the given ID; RID must be a valid index into the
+ // rule table.
+ const Rule &lookupRule(RuleID RID) const;
+
+ // Gets symbol (terminal or non-terminal) name.
+ // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
+ llvm::StringRef symbolName(SymbolID) const;
+
+ // Dumps the whole grammar.
+ std::string dump() const;
+ // Dumps a particular rule.
+ std::string dumpRule(RuleID) const;
+ // Dumps all rules of the given nonterminal symbol.
+ std::string dumpRules(SymbolID) const;
+
+ // Gives read-only access to the underlying table.
+ const GrammarTable &table() const { return *T; }
+
+private:
+ // Owned storage for rules, terminals and nonterminals.
+ std::unique_ptr<GrammarTable> T;
+};
+
+// Storage for the underlying data of the Grammar.
+// It can be constructed dynamically (from compiling BNF file) or statically
+// (a compiled data-source).
+struct GrammarTable {
+ // A nonterminal symbol together with the slice of Rules that produce it.
+ struct Nonterminal {
+ std::string Name;
+ // Corresponding rules that construct the non-terminal, it is a [start, end)
+ // index range of the Rules table.
+ struct {
+ RuleID start;
+ RuleID end;
+ } RuleRange;
+ };
+
+ // The rules are sorted (and thus grouped) by target symbol.
+ // RuleID is the index of the vector.
+ std::vector<Rule> Rules;
+ // A table of terminals (aka tokens). It corresponds to the clang::Token.
+ // clang::tok::TokenKind is the index of the table.
+ std::vector<std::string> Terminals;
+ // A table of nonterminals, sorted by name.
+ // SymbolID is the index of the table.
+ std::vector<Nonterminal> Nonterminals;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
diff --git a/clang/lib/Tooling/Syntax/CMakeLists.txt b/clang/lib/Tooling/Syntax/CMakeLists.txt
index e933faeb0f506..f8d9184977e8a 100644
--- a/clang/lib/Tooling/Syntax/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -19,3 +19,5 @@ add_clang_library(clangToolingSyntax
DEPENDS
omp_gen
)
+
+add_subdirectory(Pseudo)
diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
new file mode 100644
index 0000000000000..77dce4b70ea11
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangSyntaxPseudo
+ Grammar.cpp
+ GrammarBNF.cpp
+
+ LINK_LIBS
+ clangBasic
+ clangLex
+ )
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
new file mode 100644
index 0000000000000..014e6b4d28bc6
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -0,0 +1,77 @@
+//===--- Grammar.cpp - Grammar for clang pseudo parser ----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Stores the target and copies the sequence into the fixed-size array.
+Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
+    : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
+  assert(Sequence.size() <= Rule::MaxElements);
+  std::copy(Sequence.begin(), Sequence.end(), this->Sequence);
+}
+
+// Returns the rules whose target is the nonterminal SID, as a view into the
+// rule table. The view is empty when the nonterminal has no rules.
+llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  assert(SID < T->Nonterminals.size());
+  const auto &R = T->Nonterminals[SID].RuleRange;
+  assert(R.start <= R.end && R.end <= T->Rules.size());
+  // Offset from data() instead of taking &Rules[R.start]: for a nonterminal
+  // without rules R.start may equal Rules.size(), and indexing the vector
+  // there is out of bounds.
+  return llvm::makeArrayRef(T->Rules.data() + R.start, R.end - R.start);
+}
+
+// Returns the rule stored at index RID of the rule table.
+// RID must be smaller than the number of rules (checked by assertion).
+const Rule &Grammar::lookupRule(RuleID RID) const {
+ assert(RID < T->Rules.size());
+ return T->Rules[RID];
+}
+
+// Terminals are named by the token table, nonterminals by their own table.
+llvm::StringRef Grammar::symbolName(SymbolID SID) const {
+  return isToken(SID) ? llvm::StringRef(T->Terminals[symbolToToken(SID)])
+                      : llvm::StringRef(T->Nonterminals[SID].Name);
+}
+
+// Renders the rule RID as "target := sym1 sym2 ...".
+std::string Grammar::dumpRule(RuleID RID) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  // Go through lookupRule() so that an out-of-range RID trips its assertion
+  // instead of silently indexing past the rule table.
+  const Rule &R = lookupRule(RID);
+  OS << symbolName(R.Target) << " :=";
+  for (SymbolID SID : R.seq())
+    OS << " " << symbolName(SID);
+  // str() flushes the stream before the backing string is handed out.
+  return OS.str();
+}
+
+// Renders all rules of the nonterminal SID, one rule per line, in rule-table
+// order.
+std::string Grammar::dumpRules(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  const auto &Range = T->Nonterminals[SID].RuleRange;
+  std::string Out;
+  for (RuleID Cur = Range.start; Cur < Range.end; ++Cur) {
+    Out += dumpRule(Cur);
+    Out += '\n';
+  }
+  return Out;
+}
+
+// Renders the whole grammar: the nonterminal table followed by the rule table,
+// each entry prefixed with its ID.
+std::string Grammar::dump() const {
+  std::string Buffer;
+  llvm::raw_string_ostream OS(Buffer);
+
+  OS << "Nonterminals:\n";
+  const auto NumNonterminals = T->Nonterminals.size();
+  for (SymbolID SID = 0; SID < NumNonterminals; ++SID)
+    OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID));
+
+  OS << "Rules:\n";
+  const auto NumRules = T->Rules.size();
+  for (RuleID RID = 0; RID < NumRules; ++RID)
+    OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID));
+
+  // Make sure everything has reached Buffer before returning it.
+  OS.flush();
+  return Buffer;
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
new file mode 100644
index 0000000000000..40181e049f253
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
@@ -0,0 +1,260 @@
+//===--- GrammarBNF.cpp - build grammar from BNF files ----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+namespace {
+static const llvm::StringRef OptSuffix = "_opt";
+static const llvm::StringRef StartSymbol = "_";
+
+// Fills Out with one name per clang token kind, indexed by the kind's value.
+void initTerminals(std::vector<std::string> &Out) {
+  Out.clear();
+  Out.reserve(NumTerminals);
+  for (unsigned Kind = 0; Kind < NumTerminals; ++Kind) {
+    const auto Tok = static_cast<tok::TokenKind>(Kind);
+    // Punctuators are spelled literally; everything else uses the upper-cased
+    // token name.
+    const char *Spelling = tok::getPunctuatorSpelling(Tok);
+    if (Spelling)
+      Out.push_back(Spelling);
+    else
+      Out.push_back(llvm::StringRef(tok::getTokenName(Tok)).upper());
+  }
+}
+// Builds grammar from BNF files.
+class GrammarBuilder {
+public:
+ GrammarBuilder(std::vector<std::string> &Diagnostics)
+ : Diagnostics(Diagnostics) {}
+
+  // Builds a Grammar from the BNF text: parses the rules, expands optional
+  // symbols, assigns IDs to the nonterminals (sorted by name), converts the
+  // textual rules into the table form and finally diagnoses the result.
+  std::unique_ptr<Grammar> build(llvm::StringRef BNF) {
+    auto Specs = eliminateOptional(parse(BNF));
+
+    assert(llvm::all_of(Specs,
+                        [](const RuleSpec &R) {
+                          if (R.Target.endswith(OptSuffix))
+                            return false;
+                          return llvm::all_of(
+                              R.Sequence, [](const RuleSpec::Element &E) {
+                                return !E.Symbol.endswith(OptSuffix);
+                              });
+                        }) &&
+           "Optional symbols should be eliminated!");
+
+    auto T = std::make_unique<GrammarTable>();
+    initTerminals(T->Terminals);
+
+    // Assemble the name->ID and ID->nonterminal name maps.
+    llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
+    llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+    for (uint16_t I = 0; I < NumTerminals; ++I)
+      SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
+    // Any name that is not a known terminal is treated as a nonterminal.
+    auto Consider = [&](llvm::StringRef Name) {
+      if (!SymbolIds.count(Name))
+        UniqueNonterminals.insert(Name);
+    };
+    for (const auto &Spec : Specs) {
+      Consider(Spec.Target);
+      for (const RuleSpec::Element &Elt : Spec.Sequence)
+        Consider(Elt.Symbol);
+    }
+    for (llvm::StringRef Name : UniqueNonterminals) {
+      T->Nonterminals.emplace_back();
+      T->Nonterminals.back().Name = Name.str();
+    }
+    assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
+           "Too many nonterminals to fit in SymbolID bits!");
+    llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
+                                   const GrammarTable::Nonterminal &R) {
+      return L.Name < R.Name;
+    });
+    // Build name -> ID maps for nonterminals.
+    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+      SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
+
+    // Convert the rules.
+    T->Rules.reserve(Specs.size());
+    std::vector<SymbolID> Symbols;
+    // Capture the map by reference: it is only read, so there is no need to
+    // copy the whole table into the closure.
+    auto Lookup = [&SymbolIds](llvm::StringRef Name) {
+      auto It = SymbolIds.find(Name);
+      assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
+      return It->second;
+    };
+    for (const auto &Spec : Specs) {
+      // A sequence of exactly MaxElements symbols is valid; this matches the
+      // check in the Rule constructor.
+      assert(Spec.Sequence.size() <= Rule::MaxElements);
+      Symbols.clear();
+      for (const RuleSpec::Element &Elt : Spec.Sequence)
+        Symbols.push_back(Lookup(Elt.Symbol));
+      T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
+    }
+    assert(T->Rules.size() < (1 << RuleBits) &&
+           "Too many rules to fit in RuleID bits!");
+    llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
+      // Sorted by the Target, then by the sequence size. Remaining ties are
+      // broken by the sequence contents: the sort is not guaranteed to be
+      // stable, so a total order is needed to make RuleIDs deterministic and
+      // to place identical rules next to each other.
+      if (Left.Target != Right.Target)
+        return Left.Target < Right.Target;
+      if (Left.Size != Right.Size)
+        return Left.Size < Right.Size;
+      for (unsigned I = 0; I < Left.Size; ++I)
+        if (Left.Sequence[I] != Right.Sequence[I])
+          return Left.Sequence[I] < Right.Sequence[I];
+      return false;
+    });
+    // Rules are grouped by target now; record each nonterminal's [start, end)
+    // slice of the rule table.
+    RuleID RulePos = 0;
+    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
+      RuleID Start = RulePos;
+      while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID)
+        ++RulePos;
+      T->Nonterminals[SID].RuleRange = {Start, RulePos};
+    }
+    auto G = std::make_unique<Grammar>(std::move(T));
+    diagnoseGrammar(*G);
+    return G;
+  }
+
+private:
+ // Text representation of a BNF grammar rule.
+ struct RuleSpec {
+ // Name of the symbol on the left side of ":=".
+ llvm::StringRef Target;
+ struct Element {
+ llvm::StringRef Symbol; // Name of the symbol
+ };
+ // Symbols on the right side of ":=", in order.
+ std::vector<Element> Sequence;
+
+ // Renders the spec back as "target := sym1 sym2 ..." (used in diagnostics).
+ std::string toString() const {
+ std::vector<llvm::StringRef> Body;
+ for (const auto &E : Sequence)
+ Body.push_back(E.Symbol);
+ return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
+ }
+ };
+
+  // Splits the grammar text into lines and turns every non-empty, non-comment
+  // line into a RuleSpec. Lines rejected by parseLine are dropped.
+  std::vector<RuleSpec> parse(llvm::StringRef Lines) {
+    std::vector<RuleSpec> Specs;
+    for (llvm::StringRef Raw : llvm::split(Lines, '\n')) {
+      // Drop surrounding whitespace, then everything from the first '#' on
+      // (comment).
+      llvm::StringRef Text = Raw.trim().split('#').first;
+      if (Text.empty())
+        continue;
+      RuleSpec Parsed;
+      if (!parseLine(Text, Parsed))
+        continue;
+      Specs.push_back(std::move(Parsed));
+    }
+    return Specs;
+  }
+
+  // Parses one line of the form "target := sym1 sym2 ..." into Out.
+  // Returns false, and records a diagnostic, if the line has no ":=".
+  bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
+    auto Parts = Line.split(":=");
+    if (Parts.first == Line) { // no separator in Line
+      Diagnostics.push_back(
+          llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
+      return false;
+    }
+
+    Out.Target = Parts.first.trim();
+    Out.Sequence.clear();
+    // Split on any whitespace rather than on ' ' only, so that symbols
+    // separated by tabs are not glued together into one bogus name.
+    static const char *const Whitespace = " \t\n\v\f\r";
+    llvm::StringRef Rest = Parts.second.ltrim(Whitespace);
+    while (!Rest.empty()) {
+      // Rest starts with a non-space character, so Chunk is never empty and
+      // every iteration makes progress.
+      llvm::StringRef Chunk = Rest.take_front(Rest.find_first_of(Whitespace));
+      Out.Sequence.push_back({Chunk});
+      Rest = Rest.drop_front(Chunk.size()).ltrim(Whitespace);
+    }
+    return true;
+  }
+
+ // Inlines all _opt symbols.
+ // For example, a rule E := id +_opt id, after elimination, we have two
+ // equivalent rules:
+ // 1) E := id + id
+ // 2) E := id id
+  std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
+    std::vector<RuleSpec> Expanded;
+    // Scratch buffer holding the sequence that is currently being enumerated.
+    std::vector<RuleSpec::Element> Current;
+    for (const RuleSpec &Spec : Input) {
+      // Called once per fully expanded sequence of Spec.
+      auto Emit = [&Expanded, &Current, &Spec, this]() {
+        if (!Current.empty()) {
+          Expanded.push_back({Spec.Target, Current});
+          return;
+        }
+        Diagnostics.push_back(
+            llvm::formatv("Rule '{0}' has a nullable RHS", Spec.toString()));
+      };
+      eliminateOptionalTail(Spec.Sequence, Current, Emit);
+      assert(Current.empty());
+    }
+    return Expanded;
+  }
+  // Recursively enumerates every expansion of Elements, appending the chosen
+  // symbols to Result and invoking CB each time the end of Elements is
+  // reached. Result is restored to its incoming contents before returning.
+  void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
+                             std::vector<RuleSpec::Element> &Result,
+                             llvm::function_ref<void()> CB) {
+    if (Elements.empty())
+      return CB();
+    auto Head = Elements.front();
+    const auto Tail = Elements.drop_front(1);
+    if (Head.Symbol.endswith(OptSuffix)) {
+      // Option 1: skip the optional symbol altogether.
+      eliminateOptionalTail(Tail, Result, CB);
+      // Option 2 (below): inline it, with the "_opt" suffix dropped.
+      Head.Symbol = Head.Symbol.drop_back(OptSuffix.size());
+    }
+    Result.push_back(std::move(Head));
+    eliminateOptionalTail(Tail, Result, CB);
+    Result.pop_back();
+  }
+
+  // Diagnoses the grammar and emits warnings, if any, into Diagnostics:
+  //  - nonterminals without rules, or with a token-like (upper-case) name
+  //  - adjacent duplicate rules, and rules with an empty sequence
+  //  - nonterminals, other than the start symbol, that no rule refers to
+  void diagnoseGrammar(const Grammar &G) {
+    const auto &T = G.table();
+    for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
+      auto Range = T.Nonterminals[SID].RuleRange;
+      if (Range.start == Range.end)
+        Diagnostics.push_back(
+            llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
+      llvm::StringRef NameRef = T.Nonterminals[SID].Name;
+      if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
+        Diagnostics.push_back(llvm::formatv(
+            "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
+      }
+    }
+    // Visit every rule, including the last one: only the duplicate check needs
+    // a successor, the nullable check does not.
+    for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
+      if (RID + 1u < T.Rules.size() && T.Rules[RID] == T.Rules[RID + 1])
+        Diagnostics.push_back(
+            llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
+      // Warning for nullable non-terminals
+      if (T.Rules[RID].Size == 0)
+        Diagnostics.push_back(
+            llvm::formatv("Rule `{0}` has a nullable RHS", G.dumpRule(RID)));
+    }
+    // symbol-id -> used counts
+    std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
+    for (const Rule &R : T.Rules)
+      for (SymbolID SID : R.seq())
+        if (isNonterminal(SID))
+          ++UseCounts[SID];
+    for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
+      if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
+        Diagnostics.push_back(
+            llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
+  }
+ std::vector<std::string> &Diagnostics;
+};
+} // namespace
+
+// Entry point for building a Grammar from BNF text. Diagnostics is cleared
+// first, so afterwards it holds only the messages of this parse.
+std::unique_ptr<Grammar>
+Grammar::parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diagnostics) {
+  Diagnostics.clear();
+  GrammarBuilder Builder(Diagnostics);
+  return Builder.build(BNF);
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt b/clang/unittests/Tooling/Syntax/CMakeLists.txt
index 174f3e7bf5732..a21d558c481cb 100644
--- a/clang/unittests/Tooling/Syntax/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -28,3 +28,5 @@ target_link_libraries(SyntaxTests
PRIVATE
LLVMTestingSupport
)
+
+add_subdirectory(Pseudo)
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
new file mode 100644
index 0000000000000..77c6cbac026de
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_LINK_COMPONENTS
+ Support
+ )
+
+add_clang_unittest(ClangPseudoTests
+ GrammarTests.cpp
+)
+
+clang_target_link_libraries(ClangPseudoTests
+ PRIVATE
+ clangBasic
+ clangLex
+ clangSyntaxPseudo
+ clangTesting
+ )
+
+target_link_libraries(ClangPseudoTests
+ PRIVATE
+ LLVMTestingSupport
+ )
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
new file mode 100644
index 0000000000000..7954e53d786c3
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
@@ -0,0 +1,102 @@
+//===--- GrammarTests.cpp - grammar tests ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::IsEmpty;
+using testing::UnorderedElementsAre;
+
+MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
+template <typename... T> testing::Matcher<const Rule &> Sequence(T... IDs) {
+ return testing::Property(&Rule::seq, ElementsAre(IDs...));
+}
+
+// Test fixture that builds a Grammar from BNF text and keeps the diagnostics
+// produced while doing so.
+class GrammarTest : public ::testing::Test {
+public:
+ // (Re)builds G from BNF; Diags is reset and then filled by the parser.
+ void build(llvm::StringRef BNF) {
+ Diags.clear();
+ G = Grammar::parseBNF(BNF, Diags);
+ }
+
+ // Finds the SymbolID for Name, searching terminals first and nonterminals
+ // second. Records a test failure and returns 0 if there is no such symbol.
+ SymbolID lookup(llvm::StringRef Name) const {
+ for (unsigned I = 0; I < NumTerminals; ++I)
+ if (G->table().Terminals[I] == Name)
+ return tokenSymbol(static_cast<tok::TokenKind>(I));
+ for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID)
+ if (G->table().Nonterminals[ID].Name == Name)
+ return ID;
+ ADD_FAILURE() << "No such symbol found: " << Name;
+ return 0;
+ }
+
+protected:
+ std::unique_ptr<Grammar> G; // Grammar produced by the last build() call.
+ std::vector<std::string> Diags; // Diagnostics from the last build() call.
+};
+
+// A single rule with a trailing comment parses without diagnostics and is
+// reachable through rulesFor(), lookupRule() and symbolName().
+TEST_F(GrammarTest, Basic) {
+ build("expression := IDENTIFIER + expression # comment");
+ EXPECT_THAT(Diags, IsEmpty());
+
+ auto ExpectedRule =
+ AllOf(TargetID(lookup("expression")),
+ Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression")));
+ auto ExpressionID = lookup("expression");
+ EXPECT_EQ(G->symbolName(ExpressionID), "expression");
+ EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule));
+ // The grammar has exactly one rule, so it must have RuleID 0.
+ const auto &Rule = G->lookupRule(/*RID=*/0);
+ EXPECT_THAT(Rule, ExpectedRule);
+ EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER");
+ EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+");
+ EXPECT_THAT(G->symbolName(Rule.seq()[2]), "expression");
+}
+
+// Two _opt symbols in one rule expand into all four combinations of
+// present/absent.
+TEST_F(GrammarTest, EliminatedOptional) {
+ build("_ := CONST_opt INT ;_opt");
+ EXPECT_THAT(Diags, IsEmpty());
+ EXPECT_THAT(G->table().Rules,
+ UnorderedElementsAre(
+ Sequence(lookup("INT")),
+ Sequence(lookup("CONST"), lookup("INT")),
+ Sequence(lookup("CONST"), lookup("INT"), lookup(";")),
+ Sequence(lookup("INT"), lookup(";"))));
+}
+
+// Malformed input: nullable rules, an undefined nonterminal, a misspelled
+// terminal and a line without ":=" each produce the expected diagnostic.
+TEST_F(GrammarTest, Diagnostics) {
+ build(R"cpp(
+ _ := ,_opt
+ _ := undefined-sym
+ null :=
+ _ := IDENFIFIE # a typo of the terminal IDENFITIER
+
+ invalid
+ )cpp");
+
+ // Order is not checked, only the set of messages.
+ EXPECT_THAT(Diags, UnorderedElementsAre(
+ "Rule '_ := ,_opt' has a nullable RHS",
+ "Rule 'null := ' has a nullable RHS",
+ "No rules for nonterminal: undefined-sym",
+ "Failed to parse 'invalid': no separator :=",
+ "Token-like name IDENFIFIE is used as a nonterminal",
+ "No rules for nonterminal: IDENFIFIE"));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
More information about the cfe-commits
mailing list