[clang] f1984b1 - [pseudo] Implement LRGraph

Wed Feb 9 02:20:22 PST 2022

Author: Haojian Wu
Date: 2022-02-09T11:20:07+01:00
New Revision: f1984b14336777e1978bfe994dd1e43ebea93f00

URL: https://github.com/llvm/llvm-project/commit/f1984b14336777e1978bfe994dd1e43ebea93f00
DIFF: https://github.com/llvm/llvm-project/commit/f1984b14336777e1978bfe994dd1e43ebea93f00.diff

LOG: [pseudo] Implement LRGraph

LRGraph is the key component of the clang pseudo parser; it is a
deterministic handle-finding finite-state machine, which is used to
generate the LR parsing table.

Separate from https://reviews.llvm.org/D118196.

Differential Revision: https://reviews.llvm.org/D119172

Added: 
    clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
    clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
    clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp

Modified: 
    clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
    clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
new file mode 100644
index 000000000000..8a4bdb76e49e

--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRGraph.h
@@ -0,0 +1,177 @@
+//===--- LRGraph.h - Build an LR automaton  ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  LR parsers are bottom-up parsers -- they scan the input from left to right,
+//  and collect the right-hand side of a production rule (called handle) on top
+//  of the stack, then replace (reduce) the handle with the nonterminal defined
+//  by the production rule.
+//
+//  This file defines LRGraph, a deterministic handle-finding finite-state
+//  automaton, which is a key component in LR parsers to recognize any of
+//  handles in the grammar efficiently. We build the LR table (ACTION and GOTO
+//  Table) based on the LRGraph.
+//
+//  LRGraph can be constructed for any context-free grammar.
+//  Even for an LR-ambiguous grammar, we can construct a deterministic FSA, but
+//  interpretation of the FSA is nondeterministic -- we might be in a state
+//  where we can both continue searching for a handle and identify a handle
+//  (called a shift/reduce conflict), or identify more than one handle (called
+//  a reduce/reduce conflict).
+//
+//  LRGraph is a common model for all variants of LR automatons, from the most
+//  basic one, LR(0), to the more powerful SLR(1) and LR(1), which use a
+//  one-token lookahead in making decisions.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/Hashing.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// An LR item -- a grammar rule with a dot at some position of the body.
+// e.g. a production rule A := X Y yields 3 items:
+//   A := . X Y
+//   A := X . Y
+//   A := X Y .
+// An item indicates how much of a production rule has been recognized at a
+// position (described by dot), for example, A := X . Y indicates that we have
+// recognized the X part from the input, and we hope next to see the input
+// derivable from Y.
+class Item {
+public:
+  // Builds an item that carries only a rule ID; the dot position and the rule
+  // length keep their zero defaults.
+  static Item sentinel(RuleID ID) {
+    Item Result;
+    Result.RID = ID;
+    return Result;
+  }
+  // Builds the initial item of rule ID: the dot precedes the whole rule body.
+  static Item start(RuleID ID, const Grammar &G) {
+    Item Result = sentinel(ID);
+    Result.RuleLength = G.lookupRule(ID).Size;
+    return Result;
+  }
+
+  RuleID rule() const { return RID; }
+  uint8_t dot() const { return DotPos; }
+
+  // True if at least one symbol of the rule body follows the dot.
+  bool hasNext() const { return DotPos < RuleLength; }
+  // The symbol right after the dot. Requires hasNext().
+  SymbolID next(const Grammar &G) const {
+    assert(hasNext());
+    const auto &R = G.lookupRule(RID);
+    return R.Sequence[DotPos];
+  }
+
+  // Returns a copy of this item with the dot moved one symbol to the right.
+  // Requires hasNext().
+  Item advance() const {
+    assert(hasNext());
+    Item Advanced(*this);
+    ++Advanced.DotPos;
+    return Advanced;
+  }
+
+  std::string dump(const Grammar &G) const;
+
+  // Identity, ordering and hashing only look at the rule and the dot position;
+  // RuleLength does not take part in them.
+  bool operator==(const Item &I) const {
+    return RID == I.RID && DotPos == I.DotPos;
+  }
+  // Orders by rule ID first, then by dot position.
+  bool operator<(const Item &I) const {
+    if (RID != I.RID)
+      return RID < I.RID;
+    return DotPos < I.DotPos;
+  }
+  friend llvm::hash_code hash_value(const Item &I) {
+    return llvm::hash_combine(I.RID, I.DotPos);
+  }
+
+private:
+  RuleID RID = 0;
+  uint8_t DotPos = 0;
+  uint8_t RuleLength = 0; // the length of rule body.
+};
+
+// A state represents a node in the LR automaton graph. It is an item set, which
+// contains all possible rules that the LR parser may be parsing in that state.
+//
+// Conceptually, if we knew in advance what we're parsing, at any point we're
+// partway through parsing a production, sitting on a stack of partially parsed
+// productions. But because we don't know, there could be *several* productions
+// we're partway through. The set of possibilities is the parser state, and we
+// precompute all the transitions between these states.
+struct State {
+  // A full set of items (including non-kernel items) representing the state,
+  // in a canonical order (see SortByNextSymbol in the cpp file).
+  std::vector<Item> Items;
+  // Returns one line of text per item, each line indented by Indent spaces.
+  std::string dump(const Grammar &G, unsigned Indent = 0) const;
+};
+
+// LRGraph is a deterministic finite state automaton for LR parsing.
+//
+// Intuitively, an LR automaton is a transition graph. The graph has a
+// collection of nodes, called States. Each state corresponds to a particular
+// item set, which represents a condition that could occur during the process
+// of parsing a production. Edges are directed from one state to another. Each
+// edge is labeled by a grammar symbol (terminal or nonterminal).
+//
+// LRGraph is used to construct the LR parsing table which is a core
+// data-structure driving the LR parser.
+class LRGraph {
+public:
+  // StateID is the index in the States table; uint16_t caps it at 65535.
+  using StateID = uint16_t;
+
+  // Constructs an LR(0) automaton for the given grammar.
+  static LRGraph buildLR0(const Grammar &);
+
+  // An edge in the LR graph; it represents a transition in the LR automaton.
+  // If the parser is at state Src and the next grammar symbol (terminal or
+  // nonterminal) is Label, then it transits to state Dst.
+  struct Edge {
+    StateID Src, Dst;
+    SymbolID Label;
+  };
+  // Non-owning views of the states and edges stored in this graph.
+  llvm::ArrayRef<State> states() const { return States; }
+  llvm::ArrayRef<Edge> edges() const { return Edges; }
+  // Renders all states, then all edges, as text (intended for tests).
+  std::string dumpForTests(const Grammar &) const;
+
+private:
+  LRGraph(std::vector<State> States, std::vector<Edge> Edges)
+      : States(std::move(States)), Edges(std::move(Edges)) {}
+
+  std::vector<State> States;
+  std::vector<Edge> Edges;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+namespace llvm {
+// Support clang::syntax::pseudo::Item as DenseMap keys.
+template <> struct DenseMapInfo<clang::syntax::pseudo::Item> {
+  using Item = clang::syntax::pseudo::Item;
+
+  // Reserved keys: sentinel items built from -1 and -2 converted to RuleID.
+  static inline Item getEmptyKey() { return Item::sentinel(-1); }
+  static inline Item getTombstoneKey() { return Item::sentinel(-2); }
+
+  // Hashing and equality are delegated to Item's hash_value and operator==.
+  static unsigned getHashValue(const Item &I) { return hash_value(I); }
+  static bool isEqual(const Item &LHS, const Item &RHS) { return LHS == RHS; }
+};
+} // namespace llvm
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRGRAPH_H

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
index 223ea52b6f75..43fab1f98a06 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS Support)
 add_clang_library(clangToolingSyntaxPseudo
   Grammar.cpp
   GrammarBNF.cpp
+  LRGraph.cpp
   
   LINK_LIBS
   clangBasic

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
new file mode 100644
index 000000000000..17e47718812a
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/LRGraph.cpp
@@ -0,0 +1,231 @@
+//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using ItemSet = std::vector<clang::syntax::pseudo::Item>;
+
+namespace llvm {
+// Support ItemSet (a vector of clang::syntax::pseudo::Item) as DenseMap keys.
+template <> struct DenseMapInfo<ItemSet> {
+  using ItemInfo = DenseMapInfo<clang::syntax::pseudo::Item>;
+
+  // The reserved keys are single-element sets holding the reserved Item keys.
+  static inline ItemSet getEmptyKey() { return {ItemInfo::getEmptyKey()}; }
+  static inline ItemSet getTombstoneKey() {
+    return {ItemInfo::getTombstoneKey()};
+  }
+  // Hashes the items in sequence, so the result depends on the item order.
+  static unsigned getHashValue(const ItemSet &I) {
+    return llvm::hash_combine_range(I.begin(), I.end());
+  }
+  // Element-wise vector comparison.
+  static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
+    return LHS == RHS;
+  }
+};
+} // namespace llvm
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+// Comparator that defines the canonical order of the items in a State:
+//  - items with a trailing dot (no symbol left after the dot) come first;
+//  - the remaining items are grouped by the symbol after the dot, in
+//    increasing SymbolID order;
+//  - remaining ties are broken by Item::operator<.
+struct SortByNextSymbol {
+  SortByNextSymbol(const Grammar &G) : G(G) {}
+  // Const-qualified: comparing never mutates the comparator, and this lets it
+  // be invoked through a const object or reference.
+  bool operator()(const Item &L, const Item &R) const {
+    if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
+      return L.next(G) < R.next(G);
+    if (L.hasNext() != R.hasNext())
+      return L.hasNext() < R.hasNext(); //  a trailing dot is minimal.
+    return L < R;
+  }
+  const Grammar &G;
+};
+
+// Computes a closure of the given item set S:
+//  - extends the given S to contain all options for parsing next token;
+//  - nonterminals after a dot are recursively expanded into the begin-state
+//    of all production rules that produce that nonterminal;
+//
+// Given
+//   Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
+//   Input = [ E := . T ]
+// returns [ E :=  . T, T := . n, T := . ( E ) ]
+State closure(ItemSet Queue, const Grammar &G) {
+  llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
+  // The passed-by-value Queue is both the worklist and the final result: it
+  // already holds the input items, and items appended below are visited by
+  // later iterations of this loop.
+  for (size_t ItIndex = 0; ItIndex < Queue.size(); ++ItIndex) {
+    // Work on a copy of the item: the push_back below may reallocate Queue,
+    // and a reference into it would not survive that.
+    const Item ExpandingItem = Queue[ItIndex];
+    if (!ExpandingItem.hasNext())
+      continue;
+
+    const SymbolID NextSym = ExpandingItem.next(G);
+    if (pseudo::isToken(NextSym))
+      continue; // only nonterminals are expanded.
+
+    // Add the begin-state item of every rule producing the nonterminal.
+    const auto RRange = G.table().Nonterminals[NextSym].RuleRange;
+    for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+      const Item NewItem = Item::start(RID, G);
+      const bool IsNew = InQueue.insert(NewItem).second;
+      if (IsNew)
+        Queue.push_back(NewItem);
+    }
+  }
+  Queue.shrink_to_fit();
+  // Put the items into the canonical State order.
+  llvm::sort(Queue, SortByNextSymbol(G));
+  return {std::move(Queue)};
+}
+
+// Returns all next (with a dot advanced) kernel item sets, partitioned by the
+// advanced symbol.
+//
+// Given
+//  S = [ E := . a b, E := E . - T ]
+// returns [
+//   {id(a), [ E := a . b ]},
+//   {id(-), [ E := E - . T ]}
+// ]
+std::vector<std::pair<SymbolID, ItemSet>>
+nextAvailableKernelItems(const State &S, const Grammar &G) {
+  std::vector<std::pair<SymbolID, ItemSet>> Results;
+  const auto End = S.Items.end();
+  // Items with a trailing dot cannot be advanced. They form a prefix of the
+  // canonically ordered item list, so start at the first advanceable item.
+  auto It = llvm::find_if(S.Items, [](const Item &I) { return I.hasNext(); });
+  while (It != End) {
+    // Consume the run of consecutive items that share the symbol after the
+    // dot, advancing the dot over that symbol in each of them.
+    const SymbolID AdvancedSymbol = It->next(G);
+    ItemSet Next;
+    for (; It != End && It->next(G) == AdvancedSymbol; ++It)
+      Next.push_back(It->advance());
+    assert(!Next.empty());
+
+    // sort the set to keep order determinism for hash computation.
+    llvm::sort(Next);
+    Results.emplace_back(AdvancedSymbol, std::move(Next));
+  }
+  return Results;
+}
+
+} // namespace
+
+std::string Item::dump(const Grammar &G) const {
+  const auto &Rule = G.lookupRule(RID);
+  // Joins the names of the given symbols, separated by single spaces.
+  auto JoinNames = [&G](llvm::ArrayRef<SymbolID> Syms) {
+    std::vector<llvm::StringRef> Names;
+    for (SymbolID SID : Syms)
+      Names.push_back(G.symbolName(SID));
+    return llvm::join(Names, " ");
+  };
+  // The rule body is split at the dot position.
+  const std::string BeforeDot = JoinNames(Rule.seq().take_front(DotPos));
+  const std::string AfterDot = JoinNames(Rule.seq().drop_front(DotPos));
+  return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
+                       BeforeDot, AfterDot)
+      .str();
+}
+
+std::string State::dump(const Grammar &G, unsigned Indent) const {
+  std::string Buffer;
+  llvm::raw_string_ostream OS(Buffer);
+  // One item per line, each line indented by Indent spaces.
+  for (const Item &I : Items) {
+    OS.indent(Indent);
+    OS << llvm::formatv("{0}\n", I.dump(G));
+  }
+  return OS.str();
+}
+
+std::string LRGraph::dumpForTests(const Grammar &G) const {
+  std::string Buffer;
+  llvm::raw_string_ostream OS(Buffer);
+  // First every state (header line plus its items indented by 4 spaces)...
+  OS << "States:\n";
+  for (StateID ID = 0; ID < States.size(); ++ID)
+    OS << llvm::formatv("State {0}\n", ID) << States[ID].dump(G, /*Indent*/ 4);
+  // ...then every edge, in the order the edges are stored.
+  for (const Edge &E : Edges)
+    OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
+                        E.Dst);
+  return OS.str();
+}
+
+LRGraph LRGraph::buildLR0(const Grammar &G) {
+  // Accumulates the states and edges of the graph, deduplicating states by
+  // their kernel item sets.
+  class Builder {
+  public:
+    Builder(const Grammar &G) : G(G) {}
+
+    // Looks up the state for the given (sorted) kernel items, creating it from
+    // the closure of those items if it does not exist yet.
+    // Returns the state ID and whether a new state was created.
+    std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
+      assert(llvm::is_sorted(KernelItems) &&
+             "Item must be sorted before inserting to a hash map!");
+      auto It = StatesIndex.find(KernelItems);
+      if (It != StatesIndex.end())
+        return {It->second, false};
+      States.push_back(closure(KernelItems, G));
+      const size_t Index = States.size() - 1;
+      const StateID NextStateID = static_cast<StateID>(Index);
+      // StateID is narrower than size_t; catch truncation instead of silently
+      // aliasing two different states.
+      assert(NextStateID == Index && "Too many states to fit in StateID!");
+      StatesIndex.insert({std::move(KernelItems), NextStateID});
+      return {NextStateID, true};
+    }
+
+    void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
+      Edges.push_back({Src, Dst, Label});
+    }
+
+    // Returns a state with the given id. The reference points into the States
+    // vector, so it must not be used after a later insert().
+    const State &find(StateID ID) const {
+      assert(ID < States.size());
+      return States[ID];
+    }
+
+    LRGraph build() && {
+      States.shrink_to_fit();
+      Edges.shrink_to_fit();
+      return LRGraph(std::move(States), std::move(Edges));
+    }
+
+  private:
+    // Key is the **kernel** item sets.
+    llvm::DenseMap<ItemSet, /*index of States*/ StateID> StatesIndex;
+    std::vector<State> States;
+    std::vector<Edge> Edges;
+    const Grammar &G;
+  } Builder(G);
+
+  std::vector<StateID> PendingStates;
+  // Initialize states with the start symbol: one state per rule of it.
+  auto RRange = G.table().Nonterminals[G.startSymbol()].RuleRange;
+  for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+    auto StartState = std::vector<Item>{Item::start(RID, G)};
+    auto Result = Builder.insert(std::move(StartState));
+    assert(Result.second && "State must be new");
+    PendingStates.push_back(Result.first);
+  }
+
+  // Worklist loop: expand each pending state into its successor states.
+  while (!PendingStates.empty()) {
+    auto CurrentStateID = PendingStates.back();
+    PendingStates.pop_back();
+    // The successor list is fully computed before the first insert() below,
+    // so the state reference returned by find() is not used after States
+    // grows. Iterate by reference and move the kernel items into the builder
+    // so the item vectors are not copied.
+    for (auto &Next :
+         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+      auto Insert = Builder.insert(std::move(Next.second));
+      if (Insert.second) // new state, insert to the pending queue.
+        PendingStates.push_back(Insert.first);
+      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+    }
+  }
+  return std::move(Builder).build();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
index d609d94bbff7..de1e1216c58d 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_clang_unittest(ClangPseudoTests
   GrammarTest.cpp
+  LRGraphTest.cpp
 )
 
 clang_target_link_libraries(ClangPseudoTests

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
new file mode 100644
index 000000000000..e7f7e1a7e65d
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
@@ -0,0 +1,84 @@
+//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+TEST(LRGraph, Build) {
+  struct TestCase {
+    llvm::StringRef BNF;            // grammar under test, in BNF form.
+    llvm::StringRef ExpectedStates; // expected dumpForTests() output.
+  };
+
+  TestCase Cases[] = {{
+                          R"bnf(
+_ := expr
+expr := IDENTIFIER
+      )bnf",
+                          R"(States:
+State 0
+    _ :=  • expr
+    expr :=  • IDENTIFIER
+State 1
+    _ := expr • 
+State 2
+    expr := IDENTIFIER • 
+0 ->[expr] 1
+0 ->[IDENTIFIER] 2
+)"},
+                      {// A grammar with an S/R conflict in the SLR table:
+                       // id-id-id parses as (id-id)-id, or id-(id-id).
+                       R"bnf(
+_ := expr
+expr := expr - expr  # S/R conflict at state 4 on '-' token
+expr := IDENTIFIER
+      )bnf",
+                       R"(States:
+State 0
+    _ :=  • expr
+    expr :=  • expr - expr
+    expr :=  • IDENTIFIER
+State 1
+    _ := expr • 
+    expr := expr • - expr
+State 2
+    expr := IDENTIFIER • 
+State 3
+    expr :=  • expr - expr
+    expr := expr - • expr
+    expr :=  • IDENTIFIER
+State 4
+    expr := expr - expr • 
+    expr := expr • - expr
+0 ->[expr] 1
+0 ->[IDENTIFIER] 2
+1 ->[-] 3
+3 ->[expr] 4
+3 ->[IDENTIFIER] 2
+4 ->[-] 3
+)"}};
+  for (const auto &C : Cases) {
+    std::vector<std::string> Diags;
+    auto G = Grammar::parseBNF(C.BNF, Diags);
+    ASSERT_THAT(Diags, testing::IsEmpty()); // the grammar must parse cleanly.
+    auto LR0 = LRGraph::buildLR0(*G);
+    EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates); // exact text match.
+  }
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang