[clang] f1984b1 - [pseudo] Implement LRGraph
Haojian Wu via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 9 02:20:22 PST 2022
Author: Haojian Wu
Date: 2022-02-09T11:20:07+01:00
New Revision: f1984b14336777e1978bfe994dd1e43ebea93f00
URL: https://github.com/llvm/llvm-project/commit/f1984b14336777e1978bfe994dd1e43ebea93f00
DIFF: https://github.com/llvm/llvm-project/commit/f1984b14336777e1978bfe994dd1e43ebea93f00.diff
LOG: [pseudo] Implement LRGraph
LRGraph is the key component of the clang pseudo parser. It is a
deterministic handle-finding finite-state machine, which is used to
generate the LR parsing table.
Separate from https://reviews.llvm.org/D118196.
Differential Revision: https://reviews.llvm.org/D119172
Added:
clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
Modified:
clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
Removed:
################################################################################
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
new file mode 100644
index 000000000000..8a4bdb76e49e
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
@@ -0,0 +1,177 @@
+//===--- LRGraph.h - Build an LR automaton ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// LR parsers are bottom-up parsers -- they scan the input from left to right,
+// and collect the right-hand side of a production rule (called handle) on top
+// of the stack, then replace (reduce) the handle with the nonterminal defined
+// by the production rule.
+//
+// This file defines LRGraph, a deterministic handle-finding finite-state
+// automaton, which is a key component in LR parsers to recognize any of
+// handles in the grammar efficiently. We build the LR table (ACTION and GOTO
+// Table) based on the LRGraph.
+//
+// LRGraph can be constructed for any context-free grammars.
+// Even for an LR-ambiguous grammar, we can construct a deterministic FSA, but
+// interpretation of the FSA is nondeterministic -- we might be in a state
+// where we can either continue searching for a handle or identify a handle
+// (called shift/reduce conflicts), or identify more than one handle (called
+// reduce/reduce conflicts).
+//
+// LRGraph is a common model for all variants of LR automatons, from the most
+// basic one, LR(0), to the more powerful SLR(1) and LR(1), which use a
+// one-token lookahead in making decisions.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/Hashing.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// An LR item -- a grammar rule with a dot at some position of the body.
+// e.g. a production rule A := X Y yields 3 items:
+// A := . X Y
+// A := X . Y
+// A := X Y .
+// An item indicates how much of a production rule has been recognized at a
+// position (described by dot), for example, A := X . Y indicates that we have
+// recognized the X part from the input, and we hope next to see the input
+// derivable from Y.
+class Item {
+public:
+  // Creates the item for rule `ID` with the dot in front of the whole body.
+  // The body length is read from the grammar once and stored in the item, so
+  // hasNext() does not need the grammar.
+  static Item start(RuleID ID, const Grammar &G) {
+    Item Initial;
+    Initial.RID = ID;
+    Initial.RuleLength = G.lookupRule(ID).Size;
+    return Initial;
+  }
+  // Creates a placeholder item that only carries a rule ID; the dot position
+  // and the rule length keep their zero defaults.
+  static Item sentinel(RuleID ID) {
+    Item Placeholder;
+    Placeholder.RID = ID;
+    return Placeholder;
+  }
+
+  RuleID rule() const { return RID; }
+  uint8_t dot() const { return DotPos; }
+
+  // Whether any symbol of the rule body remains after the dot.
+  bool hasNext() const { return RuleLength > DotPos; }
+  // The symbol right after the dot. Must only be called if hasNext().
+  SymbolID next(const Grammar &G) const {
+    assert(hasNext());
+    const auto &R = G.lookupRule(RID);
+    return R.Sequence[DotPos];
+  }
+
+  // Returns a copy of this item with the dot moved one symbol to the right.
+  // Must only be called if hasNext().
+  Item advance() const {
+    assert(hasNext());
+    Item Shifted(*this);
+    ++Shifted.DotPos;
+    return Shifted;
+  }
+
+  std::string dump(const Grammar &G) const;
+
+  // Two items are equal if they agree on rule and dot position; RuleLength
+  // does not take part in the comparison.
+  bool operator==(const Item &I) const {
+    return RID == I.RID && DotPos == I.DotPos;
+  }
+  // Orders by rule ID first, then by dot position.
+  bool operator<(const Item &I) const {
+    if (RID != I.RID)
+      return RID < I.RID;
+    return DotPos < I.DotPos;
+  }
+  // Hashes the same two fields that operator== compares.
+  friend llvm::hash_code hash_value(const Item &I) {
+    return llvm::hash_combine(I.RID, I.DotPos);
+  }
+
+private:
+  RuleID RID = 0;
+  uint8_t DotPos = 0;
+  uint8_t RuleLength = 0; // the length of rule body.
+};
+
+// A state represents a node in the LR automaton graph. It is an item set, which
+// contains all possible rules that the LR parser may be parsing in that state.
+//
+// Conceptually, if we knew in advance what we're parsing, at any point we're
+// partway through parsing a production, sitting on a stack of partially parsed
+// productions. But because we don't know, there could be *several* productions
+// we're partway through. The set of possibilities is the parser state, and we
+// precompute all the transitions between these states.
+struct State {
+ // A full set of items (including non-kernel items) representing the state,
+ // in a canonical order (see SortByNextSymbol in the cpp file).
+ std::vector<Item> Items;
+
+ // Renders the state as text, one item per line (see Item::dump), with every
+ // line indented by Indent columns.
+ std::string dump(const Grammar &G, unsigned Indent = 0) const;
+};
+
+// LRGraph is a deterministic finite state automaton for LR parsing.
+//
+// Intuitively, an LR automaton is a transition graph. The graph has a
+// collection of nodes, called States. Each state corresponds to a particular
+// item set, which represents a condition that could occur during the process of
+// parsing a production. Edges are directed from one state to another. Each edge
+// is labeled by a grammar symbol (terminal or nonterminal).
+//
+// LRGraph is used to construct the LR parsing table which is a core
+// data-structure driving the LR parser.
+class LRGraph {
+public:
+ // StateID is the index of a state in the States table.
+ using StateID = uint16_t;
+
+ // Constructs an LR(0) automaton for the given grammar.
+ static LRGraph buildLR0(const Grammar &);
+
+ // An edge in the LR graph; it represents a transition in the LR automaton.
+ // If the parser is at state Src and sees the symbol Label, it transitions
+ // to state Dst.
+ struct Edge {
+ StateID Src, Dst;
+ SymbolID Label;
+ };
+
+ // Read-only views of the states (indexed by StateID) and of the edges.
+ llvm::ArrayRef<State> states() const { return States; }
+ llvm::ArrayRef<Edge> edges() const { return Edges; }
+
+ // Renders all states followed by all edges as text; intended for tests.
+ std::string dumpForTests(const Grammar &) const;
+
+private:
+ // Takes ownership of the given tables. Private: graphs are created through
+ // buildLR0().
+ LRGraph(std::vector<State> States, std::vector<Edge> Edges)
+ : States(std::move(States)), Edges(std::move(Edges)) {}
+
+ std::vector<State> States;
+ std::vector<Edge> Edges;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+namespace llvm {
+// Support clang::syntax::pseudo::Item as DenseMap keys.
+template <> struct DenseMapInfo<clang::syntax::pseudo::Item> {
+  using Item = clang::syntax::pseudo::Item;
+
+  // The two reserved keys are sentinel items built from the rule IDs -1 and
+  // -2 respectively.
+  static inline Item getEmptyKey() { return Item::sentinel(-1); }
+  static inline Item getTombstoneKey() { return Item::sentinel(-2); }
+
+  // Hashing and equality defer to Item's own hash_value and operator==.
+  static unsigned getHashValue(const Item &I) { return hash_value(I); }
+  static bool isEqual(const Item &LHS, const Item &RHS) { return LHS == RHS; }
+};
+} // namespace llvm
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
diff --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
index 223ea52b6f75..43fab1f98a06 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangToolingSyntaxPseudo
Grammar.cpp
GrammarBNF.cpp
+ LRGraph.cpp
LINK_LIBS
clangBasic
diff --git a/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
new file mode 100644
index 000000000000..17e47718812a
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
@@ -0,0 +1,231 @@
+//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using ItemSet = std::vector<clang::syntax::pseudo::Item>;
+
+namespace llvm {
+// Support ItemSet (a std::vector of clang::syntax::pseudo::Item) as DenseMap
+// keys.
+template <> struct DenseMapInfo<ItemSet> {
+  using ItemInfo = DenseMapInfo<clang::syntax::pseudo::Item>;
+
+  // The reserved keys are one-element sets that hold the corresponding
+  // reserved key of Item.
+  static inline ItemSet getEmptyKey() {
+    ItemSet Key;
+    Key.push_back(ItemInfo::getEmptyKey());
+    return Key;
+  }
+  static inline ItemSet getTombstoneKey() {
+    ItemSet Key;
+    Key.push_back(ItemInfo::getTombstoneKey());
+    return Key;
+  }
+  // The hash combines the hashes of all items in sequence, so it depends on
+  // the order of the items.
+  static unsigned getHashValue(const ItemSet &I) {
+    return llvm::hash_combine_range(I.begin(), I.end());
+  }
+  // Element-wise vector comparison; also order-sensitive.
+  static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
+    return LHS == RHS;
+  }
+};
+} // namespace llvm
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+// Comparator that defines the canonical order of the items inside a state:
+//  - items with a trailing dot (no next symbol) come first;
+//  - the remaining items are ordered by the symbol after the dot, so items
+//    sharing a next symbol are adjacent;
+//  - ties are broken by Item::operator<.
+struct SortByNextSymbol {
+  explicit SortByNextSymbol(const Grammar &G) : G(G) {}
+  // Const-qualified so the comparator can also be invoked through a const
+  // object or reference.
+  bool operator()(const Item &L, const Item &R) const {
+    if (L.hasNext() != R.hasNext())
+      return L.hasNext() < R.hasNext(); // a trailing dot is minimal.
+    // From here on either both items have a next symbol or neither has.
+    if (L.hasNext() && L.next(G) != R.next(G))
+      return L.next(G) < R.next(G);
+    return L < R;
+  }
+  const Grammar &G;
+};
+
+// Computes the closure of the given item set:
+//  - all given items are kept in the result;
+//  - whenever an item has a nonterminal right after the dot, the start item
+//    (dot in front of the body) of every rule producing that nonterminal is
+//    added, and the added items are expanded in turn;
+//  - the resulting items are returned sorted with SortByNextSymbol.
+//
+// Given
+//   Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
+//   Input = [ E := . T ]
+// returns [ E := . T, T := . n, T := . ( E ) ]
+State closure(ItemSet Queue, const Grammar &G) {
+  llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
+  // We reuse the passed-by-value Queue as the final result, as it's already
+  // initialized to the right elements. Queue grows while we walk it, so we
+  // iterate by index and copy the current item: a reference into Queue could
+  // be invalidated by the push_back below.
+  for (size_t Index = 0; Index < Queue.size(); ++Index) {
+    const Item ExpandingItem = Queue[Index];
+    if (!ExpandingItem.hasNext())
+      continue;
+
+    SymbolID NextSym = ExpandingItem.next(G);
+    if (pseudo::isToken(NextSym))
+      continue; // only nonterminals are expanded.
+    auto RRange = G.table().Nonterminals[NextSym].RuleRange;
+    for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+      Item NewItem = Item::start(RID, G);
+      if (InQueue.insert(NewItem).second) // new
+        Queue.push_back(NewItem);
+    }
+  }
+  Queue.shrink_to_fit();
+  llvm::sort(Queue, SortByNextSymbol(G));
+  return {std::move(Queue)};
+}
+
+// Computes the kernel item sets obtained from S by advancing the dot over one
+// symbol, producing one set per distinct symbol that follows a dot in S.
+// Relies on S.Items being in SortByNextSymbol order, i.e. items with a
+// trailing dot first and the rest grouped by their next symbol.
+//
+// Given
+//   S = [ E := . a b, E := E . - T ]
+// returns [
+//   {id(a), [ E := a . b ]},
+//   {id(-), [ E := E - . T ]}
+// ]
+std::vector<std::pair<SymbolID, ItemSet>>
+nextAvailableKernelItems(const State &S, const Grammar &G) {
+  std::vector<std::pair<SymbolID, ItemSet>> Results;
+  // Items whose dot is at the end have no symbol to advance over; skip them.
+  llvm::ArrayRef<Item> Remaining = llvm::ArrayRef<Item>(S.Items).drop_while(
+      [](const Item &I) { return !I.hasNext(); });
+  while (!Remaining.empty()) {
+    const SymbolID AdvancedSymbol = Remaining.front().next(G);
+    // The leading run of items that have AdvancedSymbol after the dot.
+    llvm::ArrayRef<Item> Group =
+        Remaining.take_while([&G, AdvancedSymbol](const Item &I) {
+          assert(I.hasNext());
+          return I.next(G) == AdvancedSymbol;
+        });
+    assert(!Group.empty());
+
+    // Advance the dot over the symbol for every item of the group.
+    ItemSet Advanced;
+    for (const Item &I : Group)
+      Advanced.push_back(I.advance());
+    // Sort the set to keep the order deterministic for hash computation.
+    llvm::sort(Advanced);
+    Results.emplace_back(AdvancedSymbol, std::move(Advanced));
+    Remaining = Remaining.drop_front(Group.size());
+  }
+  return Results;
+}
+
+} // namespace
+
+// Renders the item as "<target> := <symbols before dot> • <symbols after dot>",
+// using the symbol names from G.
+std::string Item::dump(const Grammar &G) const {
+  const auto &Rule = G.lookupRule(RID);
+  // Joins the names of the given symbols with single spaces.
+  auto JoinNames = [&G](llvm::ArrayRef<SymbolID> Syms) {
+    std::vector<llvm::StringRef> Names;
+    for (SymbolID SID : Syms)
+      Names.push_back(G.symbolName(SID));
+    return llvm::join(Names, " ");
+  };
+  const std::string Recognized = JoinNames(Rule.seq().take_front(DotPos));
+  const std::string Expected = JoinNames(Rule.seq().drop_front(DotPos));
+  return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
+                       Recognized, Expected)
+      .str();
+}
+
+// Renders all items of the state, one per line, each line indented by Indent.
+std::string State::dump(const Grammar &G, unsigned Indent) const {
+  std::string Buffer;
+  llvm::raw_string_ostream Out(Buffer);
+  for (const Item &I : Items) {
+    Out.indent(Indent);
+    Out << llvm::formatv("{0}\n", I.dump(G));
+  }
+  return Out.str();
+}
+
+// Renders the whole graph as text: a "States:" header, every state in ID
+// order with its items, then one line per edge.
+std::string LRGraph::dumpForTests(const Grammar &G) const {
+  std::string Buffer;
+  llvm::raw_string_ostream Out(Buffer);
+  Out << "States:\n";
+  StateID Current = 0;
+  while (Current < States.size()) {
+    Out << llvm::formatv("State {0}\n", Current);
+    Out << States[Current].dump(G, /*Indent*/ 4);
+    ++Current;
+  }
+  // Edge lines have the form "<src> ->[<label name>] <dst>".
+  for (const Edge &Transition : Edges)
+    Out << llvm::formatv("{0} ->[{1}] {2}\n", Transition.Src,
+                         G.symbolName(Transition.Label), Transition.Dst);
+  return Out.str();
+}
+
+// Builds the LR(0) automaton of G with a worklist algorithm: start from the
+// closures of the start symbol's rules, then repeatedly advance the dot over
+// every possible symbol of a pending state, creating the target state if its
+// kernel item set has not been seen yet and recording an edge to it.
+LRGraph LRGraph::buildLR0(const Grammar &G) {
+  // Accumulates states and edges; states are deduplicated by their kernel
+  // item set.
+  class Builder {
+  public:
+    explicit Builder(const Grammar &G) : G(G) {}
+
+    // Adds the state for the given kernel items unless it already exists.
+    // Returns the ID of the state and whether a new state was created.
+    std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
+      assert(llvm::is_sorted(KernelItems) &&
+             "Item must be sorted before inserting to a hash map!");
+      auto It = StatesIndex.find(KernelItems);
+      if (It != StatesIndex.end())
+        return {It->second, false};
+      States.push_back(closure(KernelItems, G));
+      const StateID NextStateID = static_cast<StateID>(States.size() - 1);
+      // Catch a silent wrap-around of the narrow StateID type.
+      assert(NextStateID == States.size() - 1 &&
+             "Number of states exceeds the range of StateID!");
+      StatesIndex.insert({std::move(KernelItems), NextStateID});
+      return {NextStateID, true};
+    }
+
+    void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
+      Edges.push_back({Src, Dst, Label});
+    }
+
+    // Returns the state with the given id. The reference is only valid until
+    // the next insert().
+    const State &find(StateID ID) const {
+      assert(ID < States.size());
+      return States[ID];
+    }
+
+    // Hands the accumulated tables over to a new LRGraph.
+    LRGraph build() && {
+      States.shrink_to_fit();
+      Edges.shrink_to_fit();
+      return LRGraph(std::move(States), std::move(Edges));
+    }
+
+  private:
+    // Key is the **kernel** item sets, value is the index in States.
+    llvm::DenseMap<ItemSet, StateID> StatesIndex;
+    std::vector<State> States;
+    std::vector<Edge> Edges;
+    const Grammar &G;
+  } Builder(G);
+
+  std::vector<StateID> PendingStates;
+  // Initialize states with the start symbol.
+  auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange;
+  for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+    auto StartState = std::vector<Item>{Item::start(RID, G)};
+    auto Result = Builder.insert(std::move(StartState));
+    assert(Result.second && "State must be new");
+    PendingStates.push_back(Result.first);
+  }
+
+  while (!PendingStates.empty()) {
+    auto CurrentStateID = PendingStates.back();
+    PendingStates.pop_back();
+    // nextAvailableKernelItems() returns its result by value, so the state
+    // reference from find() is no longer used once the loop body starts
+    // inserting, and the kernel item sets can be moved out of the temporary
+    // instead of being copied.
+    for (auto &Next :
+         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+      auto Insert = Builder.insert(std::move(Next.second));
+      if (Insert.second) // new state, insert to the pending queue.
+        PendingStates.push_back(Insert.first);
+      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+    }
+  }
+  return std::move(Builder).build();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
index d609d94bbff7..de1e1216c58d 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
add_clang_unittest(ClangPseudoTests
GrammarTest.cpp
+ LRGraphTest.cpp
)
clang_target_link_libraries(ClangPseudoTests
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
new file mode 100644
index 000000000000..e7f7e1a7e65d
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
@@ -0,0 +1,84 @@
+//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+// Builds the LR(0) graph for a few small grammars and compares the textual
+// dump of the graph against the expected states and edges.
+TEST(LRGraph, Build) {
+ // A grammar in BNF form together with the expected dumpForTests() output.
+ struct TestCase {
+ llvm::StringRef BNF;
+ llvm::StringRef ExpectedStates;
+ };
+
+ TestCase Cases[] = {{
+ R"bnf(
+_ := expr
+expr := IDENTIFIER
+ )bnf",
+ R"(States:
+State 0
+ _ := • expr
+ expr := • IDENTIFIER
+State 1
+ _ := expr •
+State 2
+ expr := IDENTIFIER •
+0 ->[expr] 1
+0 ->[IDENTIFIER] 2
+)"},
+ {// A grammar with a S/R conflict in SLR table:
+ // (id-id)-id, or id-(id-id).
+ R"bnf(
+_ := expr
+expr := expr - expr # S/R conflict at state 4 on '-' token
+expr := IDENTIFIER
+ )bnf",
+ R"(States:
+State 0
+ _ := • expr
+ expr := • expr - expr
+ expr := • IDENTIFIER
+State 1
+ _ := expr •
+ expr := expr • - expr
+State 2
+ expr := IDENTIFIER •
+State 3
+ expr := • expr - expr
+ expr := expr - • expr
+ expr := • IDENTIFIER
+State 4
+ expr := expr - expr •
+ expr := expr • - expr
+0 ->[expr] 1
+0 ->[IDENTIFIER] 2
+1 ->[-] 3
+3 ->[expr] 4
+3 ->[IDENTIFIER] 2
+4 ->[-] 3
+)"}};
+ // Each grammar must parse without diagnostics before its graph is built.
+ for (const auto &C : Cases) {
+ std::vector<std::string> Diags;
+ auto G = Grammar::parseBNF(C.BNF, Diags);
+ ASSERT_THAT(Diags, testing::IsEmpty());
+ auto LR0 = LRGraph::buildLR0(*G);
+ EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates);
+ }
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
More information about the cfe-commits
mailing list