[clang] a2fab82 - [pseudo] Implement LRTable.
Haojian Wu via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 23 00:21:44 PST 2022
Author: Haojian Wu
Date: 2022-02-23T09:21:34+01:00
New Revision: a2fab82f33bb8cc38cd1dfe7856dae706ce4297a
URL: https://github.com/llvm/llvm-project/commit/a2fab82f33bb8cc38cd1dfe7856dae706ce4297a
DIFF: https://github.com/llvm/llvm-project/commit/a2fab82f33bb8cc38cd1dfe7856dae706ce4297a.diff
LOG: [pseudo] Implement LRTable.
This patch introduces a dense implementation of the LR parsing table, which is
used by LR parsers.
We build a SLR(1) parsing table from the LR(0) graph.
Statistics of the LR parsing table on the C++ spec grammar:
- number of states: 1449
- number of actions: 83069
- size of the table (bytes): 334928
Differential Revision: https://reviews.llvm.org/D118196
Added:
clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp
clang/test/Syntax/lr-build-basic.test
clang/test/Syntax/lr-build-conflicts.test
clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp
Modified:
clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
clang/test/Syntax/check-cxx-bnf.test
clang/tools/clang-pseudo/ClangPseudo.cpp
clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
Removed:
clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
################################################################################
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
index a7ecfea902b6d..086809ef41423 100644
--- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -154,6 +154,8 @@ std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
// It can be constructed dynamically (from compiling BNF file) or statically
// (a compiled data-source).
struct GrammarTable {
+ GrammarTable();
+
struct Nonterminal {
std::string Name;
// Corresponding rules that construct the non-terminal, it is a [start, end)
@@ -169,7 +171,7 @@ struct GrammarTable {
std::vector<Rule> Rules;
// A table of terminals (aka tokens). It corresponds to the clang::Token.
// clang::tok::TokenKind is the index of the table.
- std::vector<std::string> Terminals;
+ llvm::ArrayRef<std::string> Terminals;
// A table of nonterminals, sorted by name.
// SymbolID is the index of the table.
std::vector<Nonterminal> Nonterminals;
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
new file mode 100644
index 0000000000000..025f7f141633a
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
@@ -0,0 +1,182 @@
+//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LRTable (referred as LR parsing table in the LR literature) is the core
+// component in LR parsers, it drives the LR parsers by specifying an action to
+// take given the current state on the top of the stack and the current
+// lookahead token.
+//
+// The LRTable can be described as a matrix where the rows represent
+// the states of the LR graph, the columns represent the symbols of the
+// grammar, and each entry of the matrix (called action) represents a
+// state transition in the graph.
+//
+// Typically, based on the category of the grammar symbol, the LRTable is
+// broken into two logically separate tables:
+// - ACTION table with terminals as columns -- e.g ACTION[S, a] specifies
+// next action (shift/reduce/accept/error) on state S under a lookahead
+// terminal a
+// - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
+// the state which we transition to from the state S with the nonterminal X
+//
+// LRTable is *performance-critical* as it is consulted frequently during a
+// parse. In general, LRTable is very sparse (most of the entries are empty).
+// For example, for the C++ language, the SLR table has ~1500 states and 650
+// symbols which results in a matrix having 975K entries, ~90% of entries are
+// empty.
+//
+// This file implements a speed-and-space-efficient LRTable.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Represents the LR parsing table, which can efficiently answer the question
+// "what is the next step given the lookahead token and current state on top of
+// the stack?".
+//
+// This is a dense implementation, which only takes an amount of space that is
+// proportional to the number of non-empty entries in the table.
+//
+// Unlike the typical LR parsing table which allows at most one available action
+// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
+// to be used in nondeterministic LR parsers (e.g. GLR).
+class LRTable {
+public:
+  // StateID is only 13 bits wide.
+  using StateID = uint16_t;
+  static constexpr unsigned StateBits = 13;
+
+  // Action represents the terminal and nonterminal actions, it combines the
+  // entry of the ACTION and GOTO tables from the LR literature.
+  // An Action packs a Kind and a value (a StateID or a RuleID) into 16 bits.
+  class Action {
+  public:
+    enum Kind : uint8_t {
+      // Placeholder kind (value 0); it does not describe a parsing step.
+      Sentinel = 0,
+      // Terminal actions, corresponding to entries of ACTION table.
+
+      // Shift to state n: move forward with the lookahead, and push state n
+      // onto the state stack.
+      // A shift is a forward transition, and the value n is the next state that
+      // the parser is to enter.
+      Shift,
+      // Reduce by a rule: pop the state stack.
+      Reduce,
+      // Signals that we have parsed the input successfully.
+      Accept,
+
+      // Nonterminal actions, corresponding to entry of GOTO table.
+
+      // Go to state n: push state n onto the state stack.
+      // Similar to Shift, but it is a nonterminal forward transition.
+      GoTo,
+    };
+
+    // Factory functions, one per kind. The argument is stored as the value.
+    static Action accept(RuleID RID) { return Action(Accept, RID); }
+    static Action goTo(StateID S) { return Action(GoTo, S); }
+    static Action shift(StateID S) { return Action(Shift, S); }
+    static Action reduce(RuleID RID) { return Action(Reduce, RID); }
+    static Action sentinel() { return Action(Sentinel, 0); }
+
+    // Typed accessors for the stored value; each asserts the matching kind.
+    StateID getShiftState() const {
+      assert(kind() == Shift);
+      return Value;
+    }
+    StateID getGoToState() const {
+      assert(kind() == GoTo);
+      return Value;
+    }
+    RuleID getReduceRule() const {
+      assert(kind() == Reduce);
+      return Value;
+    }
+    Kind kind() const { return static_cast<Kind>(K); }
+
+    // Two actions are equal iff both their kind and their value are equal.
+    bool operator==(const Action &L) const { return opaque() == L.opaque(); }
+    // The kind and value packed into a single integer: the kind sits above
+    // the ValueBits low bits that hold the value.
+    uint16_t opaque() const { return K << ValueBits | Value; }
+
+  private:
+    Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
+    static constexpr unsigned ValueBits = StateBits;
+    static constexpr unsigned KindBits = 3;
+    static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
+    static_assert(KindBits + ValueBits <= 16,
+                  "Must be able to store kind and value efficiently");
+    uint16_t K : KindBits;
+    // Either StateID or RuleID, depending on the Kind.
+    uint16_t Value : ValueBits;
+  };
+
+  // Returns all available actions for the given state on a terminal.
+  // Expected to be called by LR parsers.
+  llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
+  // Returns the state after we reduce a nonterminal.
+  // Expected to be called by LR parsers.
+  StateID getGoToState(StateID State, SymbolID Nonterminal) const;
+
+  // Looks up available actions.
+  // Returns empty if no available actions in the table.
+  llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+
+  // Size of the table in bytes: the object itself plus the capacity of each
+  // of its four vectors.
+  size_t bytes() const {
+    return sizeof(*this) + Actions.capacity() * sizeof(Action) +
+           States.capacity() * sizeof(StateID) +
+           NontermOffset.capacity() * sizeof(uint32_t) +
+           TerminalOffset.capacity() * sizeof(uint32_t);
+  }
+
+  std::string dumpStatistics() const;
+  std::string dumpForTests(const Grammar &G) const;
+
+  // Build a SLR(1) parsing table.
+  static LRTable buildSLR(const Grammar &G);
+
+  class Builder;
+  // Represents an entry in the table, used for building the LRTable.
+  struct Entry {
+    StateID State;
+    SymbolID Symbol;
+    Action Act;
+  };
+  // Build a specified table for testing purposes.
+  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+
+private:
+  // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
+  // Our physical representation is quite different for compactness.
+
+  // Index is nonterminal SymbolID, value is the offset into States/Actions
+  // where the entries for this nonterminal begin.
+  // Given a nonterminal id, the corresponding half-open range of
+  // States/Actions is [NontermOffset[id], NontermOffset[id+1]).
+  std::vector<uint32_t> NontermOffset;
+  // Similar to NontermOffset, but for terminals, index is tok::TokenKind.
+  std::vector<uint32_t> TerminalOffset;
+  // Parallel to Actions, the value is State (rows of the matrix).
+  // Grouped by the SymbolID, and only subranges are sorted.
+  std::vector<StateID> States;
+  // A flat list of available actions, sorted by (SymbolID, State).
+  std::vector<Action> Actions;
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
index 43fab1f98a063..8afe7f73f3085 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -4,7 +4,9 @@ add_clang_library(clangToolingSyntaxPseudo
Grammar.cpp
GrammarBNF.cpp
LRGraph.cpp
-
+ LRTable.cpp
+ LRTableBuild.cpp
+
LINK_LIBS
clangBasic
clangLex
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
index a2cd51a6c7569..4f1a5111ea73c 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -163,6 +163,23 @@ std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
return FollowSets;
}
+// Returns the spelling of every terminal, indexed by tok::TokenKind.
+// Punctuators use their spelling; all other tokens use the upper-cased token
+// name. The table is built once on first use and then shared.
+static llvm::ArrayRef<std::string> getTerminalNames() {
+  static const std::vector<std::string> Names = [] {
+    std::vector<std::string> Result;
+    Result.reserve(NumTerminals);
+    for (unsigned Index = 0; Index < NumTerminals; ++Index) {
+      const auto Kind = static_cast<tok::TokenKind>(Index);
+      const auto *Spelling = tok::getPunctuatorSpelling(Kind);
+      if (Spelling) {
+        Result.push_back(Spelling);
+        continue;
+      }
+      Result.push_back(llvm::StringRef(tok::getTokenName(Kind)).upper());
+    }
+    return Result;
+  }();
+  return Names;
+}
+// Terminals is a non-owning view of the static table returned by
+// getTerminalNames(), so it is never copied per GrammarTable.
+GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
+
} // namespace pseudo
} // namespace syntax
} // namespace clang
diff --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
index b19bed3449ba9..bc90a9674d9ef 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
@@ -21,17 +21,6 @@ namespace {
static const llvm::StringRef OptSuffix = "_opt";
static const llvm::StringRef StartSymbol = "_";
-void initTerminals(std::vector<std::string> &Out) {
- Out.clear();
- Out.reserve(NumTerminals);
- for (unsigned I = 0; I < NumTerminals; ++I) {
- tok::TokenKind K = static_cast<tok::TokenKind>(I);
- if (const auto *Punc = tok::getPunctuatorSpelling(K))
- Out.push_back(Punc);
- else
- Out.push_back(llvm::StringRef(tok::getTokenName(K)).upper());
- }
-}
// Builds grammar from BNF files.
class GrammarBuilder {
public:
@@ -53,7 +42,6 @@ class GrammarBuilder {
"Optional symbols should be eliminated!");
auto T = std::make_unique<GrammarTable>();
- initTerminals(T->Terminals);
// Assemble the name->ID and ID->nonterminal name maps.
llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
new file mode 100644
index 0000000000000..2ecb9b1cd2ce2
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
@@ -0,0 +1,124 @@
+//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Prints a human-readable description of an action. Sentinel actions must
+// never be printed.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
+  using Action = LRTable::Action;
+  switch (A.kind()) {
+  case Action::Shift:
+    OS << llvm::formatv("shift state {0}", A.getShiftState());
+    return OS;
+  case Action::Reduce:
+    OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
+    return OS;
+  case Action::GoTo:
+    OS << llvm::formatv("go to state {0}", A.getGoToState());
+    return OS;
+  case Action::Accept:
+    OS << "acc";
+    return OS;
+  case Action::Sentinel:
+    llvm_unreachable("unexpected Sentinel action kind!");
+  }
+}
+
+// Returns a short summary of the table: number of states, number of actions
+// and the memory footprint reported by bytes().
+std::string LRTable::dumpStatistics() const {
+  // StateIDs are zero-based, so the number of states is one more than the
+  // largest ID stored in the table (and zero for an empty table).
+  size_t NumOfStates = 0;
+  for (StateID S : States)
+    NumOfStates = std::max(NumOfStates, static_cast<size_t>(S) + 1);
+  return llvm::formatv(R"(
+Statistics of the LR parsing table:
+ number of states: {0}
+ number of actions: {1}
+ size of the table (bytes): {2}
+)",
+                       NumOfStates, Actions.size(), bytes())
+      .str();
+}
+
+// Renders the whole table as text, one "State N" section per state: first the
+// shift/reduce/accept actions per terminal, then the go-to entries per
+// nonterminal. Symbol and rule names come from G.
+std::string LRTable::dumpForTests(const Grammar &G) const {
+  std::string Buffer;
+  llvm::raw_string_ostream OS(Buffer);
+  // The largest StateID present in the table bounds the sections we print.
+  StateID LastState = 0;
+  for (StateID Candidate : States)
+    LastState = std::max(LastState, Candidate);
+  OS << "LRTable:\n";
+  for (StateID Row = 0; Row <= LastState; ++Row) {
+    OS << llvm::formatv("State {0}\n", Row);
+    for (uint16_t Tok = 0; Tok < NumTerminals; ++Tok) {
+      SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Tok));
+      for (auto Act : find(Row, TokID)) {
+        switch (Act.kind()) {
+        case LRTable::Action::Shift:
+          OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
+                                        G.symbolName(TokID),
+                                        Act.getShiftState());
+          break;
+        case LRTable::Action::Reduce:
+          OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
+                                        G.symbolName(TokID),
+                                        Act.getReduceRule(),
+                                        G.dumpRule(Act.getReduceRule()));
+          break;
+        case LRTable::Action::Accept:
+          OS.indent(4) << llvm::formatv("'{0}': accept\n", G.symbolName(TokID));
+          break;
+        default:
+          // Other kinds are not printed in the terminal section.
+          break;
+        }
+      }
+    }
+    for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
+         ++NontermID) {
+      if (!find(Row, NontermID).empty())
+        OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
+                                      G.symbolName(NontermID),
+                                      getGoToState(Row, NontermID));
+    }
+  }
+  return OS.str();
+}
+
+// Terminal-only lookup: asserts that the symbol is a token, then defers to
+// find().
+llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
+                                                    SymbolID Terminal) const {
+  assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
+  llvm::ArrayRef<Action> Available = find(State, Terminal);
+  return Available;
+}
+
+// Nonterminal-only lookup. The table is expected to hold exactly one GoTo
+// entry for (State, Nonterminal); this is asserted, not handled.
+LRTable::StateID LRTable::getGoToState(StateID State,
+                                       SymbolID Nonterminal) const {
+  assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
+  llvm::ArrayRef<Action> Found = find(State, Nonterminal);
+  assert(Found.size() == 1 && Found.front().kind() == Action::GoTo);
+  const Action &Only = Found.front();
+  return Only.getGoToState();
+}
+
+// Returns the actions stored for (Src, ID), or an empty range if there are
+// none.
+//
+// The offset vector for the symbol's category (terminal or nonterminal) gives
+// the half-open subrange of States/Actions that belongs to ID. That subrange
+// is sorted by state, so the entries for Src are found by binary search.
+llvm::ArrayRef<LRTable::Action> LRTable::find(StateID Src, SymbolID ID) const {
+  size_t Idx = isToken(ID) ? symbolToToken(ID) : ID;
+  assert(isToken(ID) ? Idx + 1 < TerminalOffset.size()
+                     : Idx + 1 < NontermOffset.size());
+  std::pair<size_t, size_t> TargetStateRange =
+      isToken(ID) ? std::make_pair(TerminalOffset[Idx], TerminalOffset[Idx + 1])
+                  : std::make_pair(NontermOffset[Idx], NontermOffset[Idx + 1]);
+  auto TargetedStates =
+      llvm::makeArrayRef(States.data() + TargetStateRange.first,
+                         States.data() + TargetStateRange.second);
+
+  assert(llvm::is_sorted(TargetedStates) &&
+         "subrange of the StateIdx should be sorted!");
+  const LRTable::StateID *Start = llvm::partition_point(
+      TargetedStates, [&Src](LRTable::StateID S) { return S < Src; });
+  if (Start == TargetedStates.end())
+    return {};
+  // Extend over the run of entries equal to Src, but never past the end of
+  // this symbol's subrange: the next symbol's subrange may begin with the same
+  // state, and its actions must not leak into the result.
+  const LRTable::StateID *End = Start;
+  while (End != TargetedStates.end() && *End == Src)
+    ++End;
+  // Use pointer arithmetic on data() rather than operator[] so that a range
+  // ending at the very end of Actions never indexes out of bounds.
+  return llvm::makeArrayRef(Actions.data() + (Start - States.data()),
+                            Actions.data() + (End - States.data()));
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp
new file mode 100644
index 0000000000000..f07d8b106806e
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp
@@ -0,0 +1,143 @@
+//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include <cstdint>
+
+namespace llvm {
+// Hashing and equality for LRTable::Entry, so that entries can be kept in a
+// DenseSet while a table is being built.
+template <> struct DenseMapInfo<clang::syntax::pseudo::LRTable::Entry> {
+  using Entry = clang::syntax::pseudo::LRTable::Entry;
+  // The two reserved keys differ only in their first field (State), which is
+  // set to -1 / -2 converted to an unsigned ID; the action is a sentinel.
+  static inline Entry getEmptyKey() {
+    namespace pseudo = clang::syntax::pseudo;
+    return Entry{static_cast<pseudo::SymbolID>(-1), 0,
+                 pseudo::LRTable::Action::sentinel()};
+  }
+  static inline Entry getTombstoneKey() {
+    namespace pseudo = clang::syntax::pseudo;
+    return Entry{static_cast<pseudo::SymbolID>(-2), 0,
+                 pseudo::LRTable::Action::sentinel()};
+  }
+  static unsigned getHashValue(const Entry &I) {
+    return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque());
+  }
+  // Field-wise equality over state, symbol and action.
+  static bool isEqual(const Entry &LHS, const Entry &RHS) {
+    if (LHS.State != RHS.State || LHS.Symbol != RHS.Symbol)
+      return false;
+    return LHS.Act == RHS.Act;
+  }
+};
+} // namespace llvm
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Collects (State, Symbol, Action) entries and turns them into the compact
+// LRTable representation.
+class LRTable::Builder {
+public:
+  // Adds an entry; returns false if an identical entry was already present.
+  bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
+  // Produces the final table. GT supplies the number of terminals and
+  // nonterminals, which determines the size of the offset vectors.
+  LRTable build(const GrammarTable &GT) && {
+    // E.g. given the following parsing table with 3 states and 3 terminals:
+    //
+    //            a    b     c
+    // +-------+----+-------+-+
+    // |state0 |    | s0,r0 | |
+    // |state1 | acc|       | |
+    // |state2 |    |  r1   | |
+    // +-------+----+-------+-+
+    //
+    // The final LRTable:
+    //  - TerminalOffset: [a] = 0, [b] = 1, [c] = 4, [d] = 4 (d is a sentinel)
+    //  - States:   [ 1,   0,  0,  2]
+    //    Actions:  [ acc, s0, r0, r1]
+    //                ~~~ corresponding range for terminal a
+    //                     ~~~~~~~~~~ corresponding range for terminal b
+    // First step, we sort all entries by (Symbol, State, Action).
+    std::vector<Entry> Sorted(Entries.begin(), Entries.end());
+    llvm::sort(Sorted, [](const Entry &L, const Entry &R) {
+      return std::forward_as_tuple(L.Symbol, L.State, L.Act.opaque()) <
+             std::forward_as_tuple(R.Symbol, R.State, R.Act.opaque());
+    });
+
+    LRTable Table;
+    Table.Actions.reserve(Sorted.size());
+    Table.States.reserve(Sorted.size());
+    // We are good to finalize the States and Actions.
+    for (const auto &E : Sorted) {
+      Table.Actions.push_back(E.Act);
+      Table.States.push_back(E.State);
+    }
+    // Initialize the terminal and nonterminal idx, all ranges are empty by
+    // default. Each vector has one extra trailing slot so that the range of
+    // the last symbol has an end offset too.
+    Table.TerminalOffset = std::vector<uint32_t>(GT.Terminals.size() + 1, 0);
+    Table.NontermOffset = std::vector<uint32_t>(GT.Nonterminals.size() + 1, 0);
+    // Walk the sorted entries once: nonterminals first, then terminals,
+    // recording where each symbol's run begins.
+    size_t SortedIndex = 0;
+    for (SymbolID NonterminalID = 0; NonterminalID < Table.NontermOffset.size();
+         ++NonterminalID) {
+      Table.NontermOffset[NonterminalID] = SortedIndex;
+      while (SortedIndex < Sorted.size() &&
+             Sorted[SortedIndex].Symbol == NonterminalID)
+        ++SortedIndex;
+    }
+    for (size_t Terminal = 0; Terminal < Table.TerminalOffset.size();
+         ++Terminal) {
+      Table.TerminalOffset[Terminal] = SortedIndex;
+      while (SortedIndex < Sorted.size() &&
+             Sorted[SortedIndex].Symbol ==
+                 tokenSymbol(static_cast<tok::TokenKind>(Terminal)))
+        ++SortedIndex;
+    }
+    return Table;
+  }
+
+private:
+  llvm::DenseSet<Entry> Entries;
+};
+
+// Builds a table directly from the given entries. GT only provides the symbol
+// counts needed to size the offset vectors.
+LRTable LRTable::buildForTests(const GrammarTable &GT,
+                               llvm::ArrayRef<Entry> Entries) {
+  Builder Build;
+  for (const Entry &E : Entries)
+    Build.insert(E);
+  return std::move(Build).build(GT);
+}
+
+// Builds an SLR(1) table for G from its LR(0) graph:
+//  - every graph edge becomes a shift (terminal label) or a go-to
+//    (nonterminal label);
+//  - a completed item of the start symbol yields accept on eof;
+//  - any other completed item A := ... yields a reduce on every token in the
+//    follow set of A.
+LRTable LRTable::buildSLR(const Grammar &G) {
+  auto Graph = LRGraph::buildLR0(G);
+  // Check the state count before any StateID is packed into an Action: the
+  // value field of an Action is only StateBits wide.
+  assert(Graph.states().size() <= (1 << StateBits) &&
+         "Graph states exceeds the maximum limit!");
+  Builder Build;
+  for (const auto &T : Graph.edges()) {
+    Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
+    Build.insert({T.Src, T.Label, Act});
+  }
+  auto FollowSets = followSets(G);
+  for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
+    for (const Item &I : Graph.states()[SID].Items) {
+      // Only completed items (nothing left after the dot) produce actions.
+      if (I.hasNext())
+        continue;
+      const auto Target = G.lookupRule(I.rule()).Target;
+      // If we've just parsed the start symbol, we can accept the input.
+      if (Target == G.startSymbol()) {
+        Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())});
+        continue;
+      }
+      // We've reached the end of a rule A := ..., so we can reduce if the
+      // next token is in the follow set of A.
+      for (SymbolID Follow : FollowSets[Target]) {
+        assert(isToken(Follow));
+        Build.insert({SID, Follow, Action::reduce(I.rule())});
+      }
+    }
+  }
+  return std::move(Build).build(G.table());
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/test/Syntax/check-cxx-bnf.test b/clang/test/Syntax/check-cxx-bnf.test
index fcc0fa6a1ecc7..e7e7194257629 100644
--- a/clang/test/Syntax/check-cxx-bnf.test
+++ b/clang/test/Syntax/check-cxx-bnf.test
@@ -1,2 +1,2 @@
// verify clang/lib/Tooling/Syntax/Pseudo/cxx.bnf
-// RUN: clang-pseudo -check-grammar=%cxx-bnf-file
+// RUN: clang-pseudo -grammar=%cxx-bnf-file
diff --git a/clang/test/Syntax/lr-build-basic.test b/clang/test/Syntax/lr-build-basic.test
new file mode 100644
index 0000000000000..d6538338991e1
--- /dev/null
+++ b/clang/test/Syntax/lr-build-basic.test
@@ -0,0 +1,24 @@
+_ := expr
+expr := IDENTIFIER
+
+# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
+# GRAPH: States:
+# GRAPH-NEXT: State 0
+# GRAPH-NEXT: _ := • expr
+# GRAPH-NEXT: expr := • IDENTIFIER
+# GRAPH-NEXT: State 1
+# GRAPH-NEXT: _ := expr •
+# GRAPH-NEXT: State 2
+# GRAPH-NEXT: expr := IDENTIFIER •
+# GRAPH-NEXT: 0 ->[expr] 1
+# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
+
+# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
+# TABLE: LRTable:
+# TABLE-NEXT: State 0
+# TABLE-NEXT: 'IDENTIFIER': shift state 2
+# TABLE-NEXT: 'expr': go to state 1
+# TABLE-NEXT: State 1
+# TABLE-NEXT: 'EOF': accept
+# TABLE-NEXT: State 2
+# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER'
diff --git a/clang/test/Syntax/lr-build-conflicts.test b/clang/test/Syntax/lr-build-conflicts.test
new file mode 100644
index 0000000000000..4292a7184e0f8
--- /dev/null
+++ b/clang/test/Syntax/lr-build-conflicts.test
@@ -0,0 +1,47 @@
+_ := expr
+expr := expr - expr # S/R conflict at state 4 on '-' token
+expr := IDENTIFIER
+
+# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
+# GRAPH: States
+# GRAPH-NEXT: State 0
+# GRAPH-NEXT: _ := • expr
+# GRAPH-NEXT: expr := • expr - expr
+# GRAPH-NEXT: expr := • IDENTIFIER
+# GRAPH-NEXT: State 1
+# GRAPH-NEXT: _ := expr •
+# GRAPH-NEXT: expr := expr • - expr
+# GRAPH-NEXT: State 2
+# GRAPH-NEXT: expr := IDENTIFIER •
+# GRAPH-NEXT: State 3
+# GRAPH-NEXT: expr := • expr - expr
+# GRAPH-NEXT: expr := expr - • expr
+# GRAPH-NEXT: expr := • IDENTIFIER
+# GRAPH-NEXT: State 4
+# GRAPH-NEXT: expr := expr - expr •
+# GRAPH-NEXT: expr := expr • - expr
+# GRAPH-NEXT: 0 ->[expr] 1
+# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
+# GRAPH-NEXT: 1 ->[-] 3
+# GRAPH-NEXT: 3 ->[expr] 4
+# GRAPH-NEXT: 3 ->[IDENTIFIER] 2
+# GRAPH-NEXT: 4 ->[-] 3
+
+# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
+# TABLE: LRTable:
+# TABLE-NEXT: State 0
+# TABLE-NEXT: 'IDENTIFIER': shift state 2
+# TABLE-NEXT: 'expr': go to state 1
+# TABLE-NEXT: State 1
+# TABLE-NEXT: 'EOF': accept
+# TABLE-NEXT: '-': shift state 3
+# TABLE-NEXT: State 2
+# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT: '-': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT: State 3
+# TABLE-NEXT: 'IDENTIFIER': shift state 2
+# TABLE-NEXT: 'expr': go to state 4
+# TABLE-NEXT: State 4
+# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT: '-': shift state 3
+# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr'
diff --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp
index 6fb8f58fa016c..449b9181f3ee0 100644
--- a/clang/tools/clang-pseudo/ClangPseudo.cpp
+++ b/clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
@@ -18,30 +20,45 @@ using llvm::cl::init;
using llvm::cl::opt;
static opt<std::string>
- CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."),
- init(""));
+ Grammar("grammar", desc("Parse and check a BNF grammar file."), init(""));
+static opt<bool> PrintGraph("print-graph",
+ desc("Print the LR graph for the grammar"));
+static opt<bool> PrintTable("print-table",
+ desc("Print the LR table for the grammar"));
+
+// Returns the contents of the file at Path. If the file cannot be read, prints
+// an error to stderr and exits the process with status 1.
+static std::string readOrDie(llvm::StringRef Path) {
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Buffer =
+      llvm::MemoryBuffer::getFile(Path);
+  std::error_code EC = Buffer.getError();
+  if (!EC)
+    return Buffer.get()->getBuffer().str();
+  llvm::errs() << "Error: can't read file '" << Path << "': " << EC.message()
+               << "\n";
+  ::exit(1);
+}
int main(int argc, char *argv[]) {
llvm::cl::ParseCommandLineOptions(argc, argv, "");
- if (CheckGrammar.getNumOccurrences()) {
- llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
- llvm::MemoryBuffer::getFile(CheckGrammar);
- if (std::error_code EC = Text.getError()) {
- llvm::errs() << "Error: can't read grammar file '" << CheckGrammar
- << "': " << EC.message() << "\n";
- return 1;
- }
+ if (Grammar.getNumOccurrences()) {
+ std::string Text = readOrDie(Grammar);
std::vector<std::string> Diags;
- auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags);
+ auto G = Grammar::parseBNF(Text, Diags);
if (!Diags.empty()) {
llvm::errs() << llvm::join(Diags, "\n");
return 2;
}
- llvm::errs() << llvm::formatv("grammar file {0} is parsed successfully\n",
- CheckGrammar);
+ llvm::outs() << llvm::formatv("grammar file {0} is parsed successfully\n",
+ Grammar);
+ if (PrintGraph)
+ llvm::outs() << clang::syntax::pseudo::LRGraph::buildLR0(*G).dumpForTests(
+ *G);
+ if (PrintTable)
+ llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests(
+ *G);
return 0;
}
+
return 0;
}
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
index de1e1216c58d6..509e9e4a1598b 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -4,7 +4,7 @@ set(LLVM_LINK_COMPONENTS
add_clang_unittest(ClangPseudoTests
GrammarTest.cpp
- LRGraphTest.cpp
+ LRTableTest.cpp
)
clang_target_link_libraries(ClangPseudoTests
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
deleted file mode 100644
index e7f7e1a7e65d9..0000000000000
--- a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <memory>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-TEST(LRGraph, Build) {
- struct TestCase {
- llvm::StringRef BNF;
- llvm::StringRef ExpectedStates;
- };
-
- TestCase Cases[] = {{
- R"bnf(
-_ := expr
-expr := IDENTIFIER
- )bnf",
- R"(States:
-State 0
- _ := • expr
- expr := • IDENTIFIER
-State 1
- _ := expr •
-State 2
- expr := IDENTIFIER •
-0 ->[expr] 1
-0 ->[IDENTIFIER] 2
-)"},
- {// A grammar with a S/R conflict in SLR table:
- // (id-id)-id, or id-(id-id).
- R"bnf(
-_ := expr
-expr := expr - expr # S/R conflict at state 4 on '-' token
-expr := IDENTIFIER
- )bnf",
- R"(States:
-State 0
- _ := • expr
- expr := • expr - expr
- expr := • IDENTIFIER
-State 1
- _ := expr •
- expr := expr • - expr
-State 2
- expr := IDENTIFIER •
-State 3
- expr := • expr - expr
- expr := expr - • expr
- expr := • IDENTIFIER
-State 4
- expr := expr - expr •
- expr := expr • - expr
-0 ->[expr] 1
-0 ->[IDENTIFIER] 2
-1 ->[-] 3
-3 ->[expr] 4
-3 ->[IDENTIFIER] 2
-4 ->[-] 3
-)"}};
- for (const auto &C : Cases) {
- std::vector<std::string> Diags;
- auto G = Grammar::parseBNF(C.BNF, Diags);
- ASSERT_THAT(Diags, testing::IsEmpty());
- auto LR0 = LRGraph::buildLR0(*G);
- EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates);
- }
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp
new file mode 100644
index 0000000000000..88ac697ce250d
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp
@@ -0,0 +1,56 @@
+//===--- LRTableTest.cpp - ---------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::IsEmpty;
+using testing::UnorderedElementsAre;
+using Action = LRTable::Action;
+
+// Builds a small table by hand and checks that find() returns exactly the
+// actions inserted for each (state, terminal) pair, and nothing for pairs
+// that were never inserted.
+TEST(LRTable, Builder) {
+  //           eof   semi  ...
+  // +-------+----+-------+---
+  // |state0 |    | s0,r0 |...
+  // |state1 | acc|       |...
+  // |state2 |    |  r1   |...
+  // +-------+----+-------+---
+  std::vector<LRTable::Entry> Entries = {
+      {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)},
+      {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)},
+      {/* State */ 1, tokenSymbol(tok::eof), Action::accept(2)},
+      {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}};
+  GrammarTable GT;
+  LRTable T = LRTable::buildForTests(GT, Entries);
+  EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty());
+  EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)),
+              UnorderedElementsAre(Action::shift(0), Action::reduce(0)));
+  EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)),
+              UnorderedElementsAre(Action::accept(2)));
+  EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty());
+  EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)),
+              UnorderedElementsAre(Action::reduce(1)));
+  // Verify the behavior for other non-available-actions terminals.
+  EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty());
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
More information about the cfe-commits
mailing list