[clang] a2fab82 - [pseudo] Implement LRTable.

Haojian Wu via cfe-commits cfe-commits at lists.llvm.org
Wed Feb 23 00:21:44 PST 2022


Author: Haojian Wu
Date: 2022-02-23T09:21:34+01:00
New Revision: a2fab82f33bb8cc38cd1dfe7856dae706ce4297a

URL: https://github.com/llvm/llvm-project/commit/a2fab82f33bb8cc38cd1dfe7856dae706ce4297a
DIFF: https://github.com/llvm/llvm-project/commit/a2fab82f33bb8cc38cd1dfe7856dae706ce4297a.diff

LOG: [pseudo] Implement LRTable.

This patch introduces a dense implementation of the LR parsing table, which is
used by LR parsers.

We build a SLR(1) parsing table from the LR(0) graph.

Statistics of the LR parsing table on the C++ spec grammar:
  - number of states: 1449
  - number of actions: 83069
  - size of the table (bytes): 334928

Differential Revision: https://reviews.llvm.org/D118196

Added: 
    clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
    clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
    clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp
    clang/test/Syntax/lr-build-basic.test
    clang/test/Syntax/lr-build-conflicts.test
    clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp

Modified: 
    clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
    clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
    clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
    clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
    clang/test/Syntax/check-cxx-bnf.test
    clang/tools/clang-pseudo/ClangPseudo.cpp
    clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt

Removed: 
    clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp


################################################################################
diff  --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
index a7ecfea902b6d..086809ef41423 100644
--- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -154,6 +154,8 @@ std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
 // It can be constructed dynamically (from compiling BNF file) or statically
 // (a compiled data-source).
 struct GrammarTable {
+  GrammarTable();
+
   struct Nonterminal {
     std::string Name;
     // Corresponding rules that construct the non-terminal, it is a [start, end)
@@ -169,7 +171,7 @@ struct GrammarTable {
   std::vector<Rule> Rules;
   // A table of terminals (aka tokens). It corresponds to the clang::Token.
   // clang::tok::TokenKind is the index of the table.
-  std::vector<std::string> Terminals;
+  llvm::ArrayRef<std::string> Terminals;
   // A table of nonterminals, sorted by name.
   // SymbolID is the index of the table.
   std::vector<Nonterminal> Nonterminals;

diff  --git a/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
new file mode 100644
index 0000000000000..025f7f141633a
--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
@@ -0,0 +1,182 @@
+//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  The LRTable (referred as LR parsing table in the LR literature) is the core
+//  component in LR parsers, it drives the LR parsers by specifying an action to
+//  take given the current state on the top of the stack and the current
+//  lookahead token.
+//
+//  The LRTable can be described as a matrix where the rows represent
+//  the states of the LR graph, the columns represent the symbols of the
+//  grammar, and each entry of the matrix (called action) represents a
+//  state transition in the graph.
+//
+//  Typically, based on the category of the grammar symbol, the LRTable is
+//  broken into two logically separate tables:
+//    - ACTION table with terminals as columns -- e.g ACTION[S, a] specifies
+//      next action (shift/reduce/accept/error) on state S under a lookahead
+//      terminal a
+//    - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
+//      the state which we transit to from the state S with the nonterminal X
+//
+//  LRTable is *performance-critical* as it is consulted frequently during a
+//  parse. In general, LRTable is very sparse (most of the entries are empty).
+//  For example, for the C++ language, the SLR table has ~1500 states and 650
+//  symbols which results in a matrix having 975K entries, ~90% of entries are
+//  empty.
+//
+//  This file implements a speed-and-space-efficient LRTable.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Represents the LR parsing table, which can efficiently answer the question
+// "what is the next step given the lookahead token and current state on top of
+// the stack?".
+//
+// This is a dense implementation, which only takes an amount of space that is
+// proportional to the number of non-empty entries in the table.
+//
+// Unlike the typical LR parsing table which allows at most one available action
+// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
+// to be used in nondeterministic LR parsers (e.g. GLR).
+class LRTable {
+public:
+  // StateID is only 13 bits wide.
+  using StateID = uint16_t;
+  static constexpr unsigned StateBits = 13;
+
+  // Action represents the terminal and nonterminal actions, it combines the
+  // entry of the ACTION and GOTO tables from the LR literature.
+  //
+  // An Action packs a Kind and a value (a StateID or a RuleID, depending on
+  // the Kind) into two bit-fields that together fit in 16 bits.
+  class Action {
+  public:
+    enum Kind : uint8_t {
+      Sentinel = 0,
+      // Terminal actions, corresponding to entries of ACTION table.
+
+      // Shift to state n: move forward with the lookahead, and push state n
+      // onto the state stack.
+      // A shift is a forward transition, and the value n is the next state that
+      // the parser is to enter.
+      Shift,
+      // Reduce by a rule: pop the state stack.
+      Reduce,
+      // Signals that we have parsed the input successfully.
+      Accept,
+
+      // Nonterminal actions, corresponding to entry of GOTO table.
+
+      // Go to state n: push state n onto the state stack.
+      // Similar to Shift, but it is a nonterminal forward transition.
+      GoTo,
+    };
+
+    // Named constructors, one per Kind. The argument is stored as the value.
+    static Action accept(RuleID RID) { return Action(Accept, RID); }
+    static Action goTo(StateID S) { return Action(GoTo, S); }
+    static Action shift(StateID S) { return Action(Shift, S); }
+    static Action reduce(RuleID RID) { return Action(Reduce, RID); }
+    static Action sentinel() { return Action(Sentinel, 0); }
+
+    // Accessors for the stored value. Each asserts that the action has the
+    // matching kind.
+    // NOTE(review): there is no accessor for the rule stored by accept().
+    StateID getShiftState() const {
+      assert(kind() == Shift);
+      return Value;
+    }
+    StateID getGoToState() const {
+      assert(kind() == GoTo);
+      return Value;
+    }
+    RuleID getReduceRule() const {
+      assert(kind() == Reduce);
+      return Value;
+    }
+    Kind kind() const { return static_cast<Kind>(K); }
+
+    // Two actions are equal iff both their kind and their value are equal.
+    bool operator==(const Action &L) const { return opaque() == L.opaque(); }
+    // The action as a single integer: the kind in the bits above ValueBits,
+    // the value in the low ValueBits bits.
+    uint16_t opaque() const { return K << ValueBits | Value; };
+
+  private:
+    // Value is stored in a ValueBits-wide bit-field, so only its low ValueBits
+    // bits are kept.
+    Action(Kind K1, unsigned Value) : K(K1), Value(Value) {}
+    static constexpr unsigned ValueBits = StateBits;
+    static constexpr unsigned KindBits = 3;
+    // RuleBits is not declared in this header; presumably it comes from
+    // Grammar.h -- confirm.
+    static_assert(ValueBits >= RuleBits, "Value must be able to store RuleID");
+    static_assert(KindBits + ValueBits <= 16,
+                  "Must be able to store kind and value efficiently");
+    uint16_t K : KindBits;
+    // Either StateID or RuleID, depending on the Kind.
+    uint16_t Value : ValueBits;
+  };
+
+  // Returns all available actions for the given state on a terminal.
+  // Expected to be called by LR parsers.
+  llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
+  // Returns the state after we reduce a nonterminal.
+  // Expected to be called by LR parsers.
+  StateID getGoToState(StateID State, SymbolID Nonterminal) const;
+
+  // Looks up available actions.
+  // Returns empty if no available actions in the table.
+  llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+
+  // Approximate memory footprint in bytes: the object itself plus the
+  // allocated capacity of each of the four backing vectors.
+  size_t bytes() const {
+    return sizeof(*this) + Actions.capacity() * sizeof(Action) +
+           States.capacity() * sizeof(StateID) +
+           NontermOffset.capacity() * sizeof(uint32_t) +
+           TerminalOffset.capacity() * sizeof(uint32_t);
+  }
+
+  std::string dumpStatistics() const;
+  std::string dumpForTests(const Grammar &G) const;
+
+  // Build a SLR(1) parsing table.
+  static LRTable buildSLR(const Grammar &G);
+
+  class Builder;
+  // Represents an entry in the table, used for building the LRTable.
+  struct Entry {
+    StateID State;
+    SymbolID Symbol;
+    Action Act;
+  };
+  // Build a specified table for testing purposes.
+  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+
+private:
+  // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
+  // Our physical representation is quite different for compactness.
+
+  // Index is nonterminal SymbolID, value is the offset into States/Actions
+  // where the entries for this nonterminal begin.
+  // Given a nonterminal id, the corresponding half-open range of
+  // States/Actions is [NontermOffset[id], NontermOffset[id+1]).
+  std::vector<uint32_t> NontermOffset;
+  // Similar to NontermOffset, but for terminals, index is tok::TokenKind.
+  std::vector<uint32_t> TerminalOffset;
+  // Parallel to Actions, the value is State (rows of the matrix).
+  // Grouped by the SymbolID, and only subranges are sorted.
+  std::vector<StateID> States;
+  // A flat list of available actions, sorted by (SymbolID, State).
+  std::vector<Action> Actions;
+};
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRTABLE_H

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
index 43fab1f98a063..8afe7f73f3085 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -4,7 +4,9 @@ add_clang_library(clangToolingSyntaxPseudo
   Grammar.cpp
   GrammarBNF.cpp
   LRGraph.cpp
-  
+  LRTable.cpp
+  LRTableBuild.cpp
+
   LINK_LIBS
   clangBasic
   clangLex

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
index a2cd51a6c7569..4f1a5111ea73c 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -163,6 +163,23 @@ std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
   return FollowSets;
 }
 
+// Returns the names of all terminals, indexed by tok::TokenKind.
+// A punctuator is named by its spelling; every other token kind is named by
+// its upper-cased token name. The list is built once, on first use, and is
+// shared by every GrammarTable.
+static llvm::ArrayRef<std::string> getTerminalNames() {
+  static const std::vector<std::string> *TerminalNames = []() {
+    static std::vector<std::string> Names;
+    Names.reserve(NumTerminals);
+    for (unsigned Kind = 0; Kind != NumTerminals; ++Kind) {
+      const auto TK = static_cast<tok::TokenKind>(Kind);
+      const char *Punctuator = tok::getPunctuatorSpelling(TK);
+      Names.push_back(Punctuator
+                          ? std::string(Punctuator)
+                          : llvm::StringRef(tok::getTokenName(TK)).upper());
+    }
+    return &Names;
+  }();
+  return *TerminalNames;
+}
+// Every table refers to the shared list of terminal names.
+GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
+
 } // namespace pseudo
 } // namespace syntax
 } // namespace clang

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
index b19bed3449ba9..bc90a9674d9ef 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
@@ -21,17 +21,6 @@ namespace {
 static const llvm::StringRef OptSuffix = "_opt";
 static const llvm::StringRef StartSymbol = "_";
 
-void initTerminals(std::vector<std::string> &Out) {
-  Out.clear();
-  Out.reserve(NumTerminals);
-  for (unsigned I = 0; I < NumTerminals; ++I) {
-    tok::TokenKind K = static_cast<tok::TokenKind>(I);
-    if (const auto *Punc = tok::getPunctuatorSpelling(K))
-      Out.push_back(Punc);
-    else
-      Out.push_back(llvm::StringRef(tok::getTokenName(K)).upper());
-  }
-}
 // Builds grammar from BNF files.
 class GrammarBuilder {
 public:
@@ -53,7 +42,6 @@ class GrammarBuilder {
            "Optional symbols should be eliminated!");
 
     auto T = std::make_unique<GrammarTable>();
-    initTerminals(T->Terminals);
 
     // Assemble the name->ID and ID->nonterminal name maps.
     llvm::DenseSet<llvm::StringRef> UniqueNonterminals;

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
new file mode 100644
index 0000000000000..2ecb9b1cd2ce2
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
@@ -0,0 +1,124 @@
+//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Prints a short human-readable description of an action.
+// Sentinel actions are not expected to be printed.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
+  using Action = LRTable::Action;
+  switch (A.kind()) {
+  case Action::Sentinel:
+    llvm_unreachable("unexpected Sentinel action kind!");
+  case Action::Accept:
+    OS << "acc";
+    return OS;
+  case Action::GoTo:
+    OS << llvm::formatv("go to state {0}", A.getGoToState());
+    return OS;
+  case Action::Reduce:
+    OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
+    return OS;
+  case Action::Shift:
+    OS << llvm::formatv("shift state {0}", A.getShiftState());
+    return OS;
+  }
+}
+
+// Returns a short multi-line summary of the table: states, actions and size.
+// NOTE(review): the "number of states" reported here is the largest StateID
+// that has an entry in the table, not a count of distinct states.
+std::string LRTable::dumpStatistics() const {
+  StateID LargestState = 0;
+  for (size_t I = 0; I < States.size(); ++I) {
+    if (LargestState < States[I])
+      LargestState = States[I];
+  }
+  return llvm::formatv(R"(
+Statistics of the LR parsing table:
+    number of states: {0}
+    number of actions: {1}
+    size of the table (bytes): {2}
+)",
+                       LargestState, Actions.size(), bytes())
+      .str();
+}
+
+// Renders the whole table as text, one section per state.
+// Within a state, terminal actions (shift/reduce/accept) are listed first in
+// token-kind order, followed by the go-to transitions in nonterminal order.
+// States from 0 up to the largest state that has an entry are printed.
+std::string LRTable::dumpForTests(const Grammar &G) const {
+  std::string Dump;
+  llvm::raw_string_ostream OS(Dump);
+  StateID LastState = 0;
+  for (StateID S : States) {
+    if (LastState < S)
+      LastState = S;
+  }
+  OS << "LRTable:\n";
+  for (StateID S = 0; S <= LastState; ++S) {
+    OS << llvm::formatv("State {0}\n", S);
+    for (uint16_t Tok = 0; Tok < NumTerminals; ++Tok) {
+      const SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Tok));
+      for (auto A : find(S, TokID)) {
+        switch (A.kind()) {
+        case LRTable::Action::Shift:
+          OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
+                                        G.symbolName(TokID), A.getShiftState());
+          break;
+        case LRTable::Action::Reduce:
+          OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
+                                        G.symbolName(TokID), A.getReduceRule(),
+                                        G.dumpRule(A.getReduceRule()));
+          break;
+        case LRTable::Action::Accept:
+          OS.indent(4) << llvm::formatv("'{0}': accept\n", G.symbolName(TokID));
+          break;
+        default:
+          // Other kinds are not printed in the terminal section.
+          break;
+        }
+      }
+    }
+    for (SymbolID NT = 0; NT < G.table().Nonterminals.size(); ++NT) {
+      if (!find(S, NT).empty())
+        OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
+                                      G.symbolName(NT), getGoToState(S, NT));
+    }
+  }
+  return OS.str();
+}
+
+// Returns every action available in State when the lookahead is Terminal.
+// Terminal must be a token symbol.
+llvm::ArrayRef<LRTable::Action>
+LRTable::getActions(StateID State, SymbolID Terminal) const {
+  assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
+  llvm::ArrayRef<Action> Available = find(State, Terminal);
+  return Available;
+}
+
+// Returns the state entered from State on Nonterminal.
+// Nonterminal must be a nonterminal symbol, and the table is expected to hold
+// exactly one GoTo entry for the pair.
+LRTable::StateID LRTable::getGoToState(StateID State,
+                                       SymbolID Nonterminal) const {
+  assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
+  const llvm::ArrayRef<Action> Targets = find(State, Nonterminal);
+  assert(Targets.size() == 1 && Targets.front().kind() == Action::GoTo);
+  const Action &Transition = Targets.front();
+  return Transition.getGoToState();
+}
+
+// Looks up the actions recorded for the pair (Src, ID).
+//
+// The entries of each symbol occupy one contiguous slice of States (and of the
+// parallel Actions), sorted by state. The slice bounds come from
+// TerminalOffset or NontermOffset. We binary-search the slice for the first
+// entry of Src and then collect the run of entries that share that state.
+//
+// Returns an empty range if the table has no entry for (Src, ID).
+llvm::ArrayRef<LRTable::Action> LRTable::find(StateID Src, SymbolID ID) const {
+  size_t Idx = isToken(ID) ? symbolToToken(ID) : ID;
+  assert(isToken(ID) ? Idx + 1 < TerminalOffset.size()
+                     : Idx + 1 < NontermOffset.size());
+  std::pair<size_t, size_t> TargetStateRange =
+      isToken(ID) ? std::make_pair(TerminalOffset[Idx], TerminalOffset[Idx + 1])
+                  : std::make_pair(NontermOffset[Idx], NontermOffset[Idx + 1]);
+  auto TargetedStates =
+      llvm::makeArrayRef(States.data() + TargetStateRange.first,
+                         States.data() + TargetStateRange.second);
+
+  assert(llvm::is_sorted(TargetedStates) &&
+         "subrange of the StateIdx should be sorted!");
+  const LRTable::StateID *Start = llvm::partition_point(
+      TargetedStates, [&Src](LRTable::StateID S) { return S < Src; });
+  if (Start == TargetedStates.end())
+    return {};
+  // Stop at the end of this symbol's slice: the next symbol's slice may begin
+  // with an entry for the same state, which must not leak into the result.
+  const LRTable::StateID *End = Start;
+  while (End != TargetedStates.end() && *End == Src)
+    ++End;
+  // Use pointer arithmetic rather than &Actions[i]: the run may end exactly at
+  // Actions.size(), an index operator[] must not be asked for.
+  return llvm::makeArrayRef(Actions.data() + (Start - States.data()),
+                            Actions.data() + (End - States.data()));
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp
new file mode 100644
index 0000000000000..f07d8b106806e
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/LRTableBuild.cpp
@@ -0,0 +1,143 @@
+//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include <cstdint>
+
+namespace llvm {
+// Allows LRTable::Entry to be used as a key of llvm::DenseSet/DenseMap.
+//
+// The empty and tombstone keys put a reserved value into the State field and
+// use the Sentinel action. Real states are limited to LRTable::StateBits bits,
+// so these keys do not collide with genuine entries.
+template <> struct DenseMapInfo<clang::syntax::pseudo::LRTable::Entry> {
+  using Entry = clang::syntax::pseudo::LRTable::Entry;
+  static inline Entry getEmptyKey() {
+    // Entry is {State, Symbol, Act}: the reserved value initialises State, so
+    // it is cast to StateID.
+    return Entry{static_cast<clang::syntax::pseudo::LRTable::StateID>(-1), 0,
+                 clang::syntax::pseudo::LRTable::Action::sentinel()};
+  }
+  static inline Entry getTombstoneKey() {
+    return Entry{static_cast<clang::syntax::pseudo::LRTable::StateID>(-2), 0,
+                 clang::syntax::pseudo::LRTable::Action::sentinel()};
+  }
+  // Hashes all three fields; the action contributes its packed integer form.
+  static unsigned getHashValue(const Entry &I) {
+    return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque());
+  }
+  // Entries are equal iff state, symbol and action are all equal.
+  static bool isEqual(const Entry &LHS, const Entry &RHS) {
+    return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol &&
+           LHS.Act == RHS.Act;
+  }
+};
+} // namespace llvm
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// Collects (State, Symbol, Action) entries and lowers them into the compact
+// representation used by LRTable.
+class LRTable::Builder {
+public:
+  // Records an entry. Returns false if an identical entry was already present.
+  bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
+
+  // Produces the final table from the collected entries.
+  LRTable build(const GrammarTable &GT) && {
+    // E.g. given the following parsing table with 3 states and 3 terminals:
+    //
+    //            a    b     c
+    // +-------+----+-------+-+
+    // |state0 |    | s0,r0 | |
+    // |state1 | acc|       | |
+    // |state2 |    |  r1   | |
+    // +-------+----+-------+-+
+    //
+    // The final LRTable:
+    //  - TerminalOffset: [a] = 0, [b] = 1, [c] = 4, [d] = 4 (d is a sentinel)
+    //  -  States:     [ 1,    0,  0,  2]
+    //    Actions:     [ acc, s0, r0, r1]
+    //                   ~~~ corresponding range for terminal a
+    //                        ~~~~~~~~~~ corresponding range for terminal b
+    //
+    // Step 1: order the entries by (Symbol, State, Action).
+    std::vector<Entry> Ordered(Entries.begin(), Entries.end());
+    llvm::sort(Ordered, [](const Entry &L, const Entry &R) {
+      if (L.Symbol != R.Symbol)
+        return L.Symbol < R.Symbol;
+      if (L.State != R.State)
+        return L.State < R.State;
+      return L.Act.opaque() < R.Act.opaque();
+    });
+
+    // Step 2: the ordered entries directly give the parallel States/Actions
+    // arrays.
+    LRTable Result;
+    Result.Actions.reserve(Ordered.size());
+    Result.States.reserve(Ordered.size());
+    for (size_t I = 0, N = Ordered.size(); I != N; ++I) {
+      Result.Actions.push_back(Ordered[I].Act);
+      Result.States.push_back(Ordered[I].State);
+    }
+
+    // Step 3: compute, for each symbol, where its entries start. Each offset
+    // vector has one extra trailing slot so that [Offset[i], Offset[i+1]) is
+    // valid for the last symbol too. All ranges start out empty.
+    Result.TerminalOffset = std::vector<uint32_t>(GT.Terminals.size() + 1, 0);
+    Result.NontermOffset = std::vector<uint32_t>(GT.Nonterminals.size() + 1, 0);
+    size_t Cursor = 0;
+    // Nonterminals come first in the ordered entries...
+    for (SymbolID NT = 0; NT < Result.NontermOffset.size(); ++NT) {
+      Result.NontermOffset[NT] = Cursor;
+      while (Cursor < Ordered.size() && Ordered[Cursor].Symbol == NT)
+        ++Cursor;
+    }
+    // ...followed by the terminals, in token-kind order.
+    for (size_t Tok = 0; Tok < Result.TerminalOffset.size(); ++Tok) {
+      Result.TerminalOffset[Tok] = Cursor;
+      while (Cursor < Ordered.size() &&
+             Ordered[Cursor].Symbol ==
+                 tokenSymbol(static_cast<tok::TokenKind>(Tok)))
+        ++Cursor;
+    }
+    return Result;
+  }
+
+private:
+  // The set of distinct entries inserted so far.
+  llvm::DenseSet<Entry> Entries;
+};
+
+// Builds a table holding exactly the given entries (duplicates are dropped).
+// Intended for tests that need a hand-written table.
+LRTable LRTable::buildForTests(const GrammarTable &GT,
+                               llvm::ArrayRef<Entry> Entries) {
+  Builder TableBuilder;
+  for (size_t I = 0; I < Entries.size(); ++I)
+    TableBuilder.insert(Entries[I]);
+  return std::move(TableBuilder).build(GT);
+}
+
+// Builds the SLR(1) table for G from its LR(0) graph:
+//  - each graph edge becomes a shift (terminal label) or a go-to (nonterminal
+//    label);
+//  - each completed item of the start symbol becomes an accept on eof;
+//  - each other completed item A := ... becomes a reduce on every token in the
+//    follow set of A.
+LRTable LRTable::buildSLR(const Grammar &G) {
+  Builder Build;
+  auto Graph = LRGraph::buildLR0(G);
+  for (const auto &Edge : Graph.edges()) {
+    if (isToken(Edge.Label))
+      Build.insert({Edge.Src, Edge.Label, Action::shift(Edge.Dst)});
+    else
+      Build.insert({Edge.Src, Edge.Label, Action::goTo(Edge.Dst)});
+  }
+  assert(Graph.states().size() <= (1 << StateBits) &&
+         "Graph states execceds the maximum limit!");
+  auto FollowSets = followSets(G);
+  for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
+    for (const Item &I : Graph.states()[SID].Items) {
+      // Only completed items (dot at the end of the rule) produce actions.
+      if (I.hasNext())
+        continue;
+      const auto Target = G.lookupRule(I.rule()).Target;
+      // If we've just parsed the start symbol, we can accept the input.
+      if (Target == G.startSymbol()) {
+        Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())});
+        continue;
+      }
+      // We've reached the end of a rule A := ..., so we can reduce if the
+      // next token is in the follow set of A.
+      for (SymbolID Follow : FollowSets[Target]) {
+        assert(isToken(Follow));
+        Build.insert({SID, Follow, Action::reduce(I.rule())});
+      }
+    }
+  }
+  return std::move(Build).build(G.table());
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang

diff  --git a/clang/test/Syntax/check-cxx-bnf.test b/clang/test/Syntax/check-cxx-bnf.test
index fcc0fa6a1ecc7..e7e7194257629 100644
--- a/clang/test/Syntax/check-cxx-bnf.test
+++ b/clang/test/Syntax/check-cxx-bnf.test
@@ -1,2 +1,2 @@
 // verify clang/lib/Tooling/Syntax/Pseudo/cxx.bnf
-// RUN: clang-pseudo -check-grammar=%cxx-bnf-file
+// RUN: clang-pseudo -grammar=%cxx-bnf-file

diff  --git a/clang/test/Syntax/lr-build-basic.test b/clang/test/Syntax/lr-build-basic.test
new file mode 100644
index 0000000000000..d6538338991e1
--- /dev/null
+++ b/clang/test/Syntax/lr-build-basic.test
@@ -0,0 +1,24 @@
+_ := expr
+expr := IDENTIFIER
+
+# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
+#      GRAPH: States:
+# GRAPH-NEXT: State 0
+# GRAPH-NEXT:     _ :=  • expr
+# GRAPH-NEXT:     expr :=  • IDENTIFIER
+# GRAPH-NEXT: State 1
+# GRAPH-NEXT:     _ := expr • 
+# GRAPH-NEXT: State 2
+# GRAPH-NEXT:     expr := IDENTIFIER • 
+# GRAPH-NEXT: 0 ->[expr] 1
+# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
+
+# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
+#      TABLE: LRTable:
+# TABLE-NEXT: State 0
+# TABLE-NEXT:     'IDENTIFIER': shift state 2
+# TABLE-NEXT:     'expr': go to state 1
+# TABLE-NEXT: State 1
+# TABLE-NEXT:     'EOF': accept
+# TABLE-NEXT: State 2
+# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := IDENTIFIER'

diff  --git a/clang/test/Syntax/lr-build-conflicts.test b/clang/test/Syntax/lr-build-conflicts.test
new file mode 100644
index 0000000000000..4292a7184e0f8
--- /dev/null
+++ b/clang/test/Syntax/lr-build-conflicts.test
@@ -0,0 +1,47 @@
+_ := expr
+expr := expr - expr  # S/R conflict at state 4 on '-' token
+expr := IDENTIFIER
+
+# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
+#      GRAPH: States
+# GRAPH-NEXT: State 0
+# GRAPH-NEXT:     _ :=  • expr
+# GRAPH-NEXT:     expr :=  • expr - expr
+# GRAPH-NEXT:     expr :=  • IDENTIFIER
+# GRAPH-NEXT: State 1
+# GRAPH-NEXT:     _ := expr • 
+# GRAPH-NEXT:     expr := expr • - expr
+# GRAPH-NEXT: State 2
+# GRAPH-NEXT:     expr := IDENTIFIER • 
+# GRAPH-NEXT: State 3
+# GRAPH-NEXT:     expr :=  • expr - expr
+# GRAPH-NEXT:     expr := expr - • expr
+# GRAPH-NEXT:     expr :=  • IDENTIFIER
+# GRAPH-NEXT: State 4
+# GRAPH-NEXT:     expr := expr - expr • 
+# GRAPH-NEXT:     expr := expr • - expr
+# GRAPH-NEXT: 0 ->[expr] 1
+# GRAPH-NEXT: 0 ->[IDENTIFIER] 2
+# GRAPH-NEXT: 1 ->[-] 3
+# GRAPH-NEXT: 3 ->[expr] 4
+# GRAPH-NEXT: 3 ->[IDENTIFIER] 2
+# GRAPH-NEXT: 4 ->[-] 3
+
+# RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
+#      TABLE: LRTable:
+# TABLE-NEXT: State 0
+# TABLE-NEXT:     'IDENTIFIER': shift state 2
+# TABLE-NEXT:     'expr': go to state 1
+# TABLE-NEXT: State 1
+# TABLE-NEXT:     'EOF': accept
+# TABLE-NEXT:     '-': shift state 3
+# TABLE-NEXT: State 2
+# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT:     '-': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT: State 3
+# TABLE-NEXT:     'IDENTIFIER': shift state 2
+# TABLE-NEXT:     'expr': go to state 4
+# TABLE-NEXT: State 4
+# TABLE-NEXT:     'EOF': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT:     '-': shift state 3
+# TABLE-NEXT:     '-': reduce by rule 2 'expr := expr - expr'

diff  --git a/clang/tools/clang-pseudo/ClangPseudo.cpp b/clang/tools/clang-pseudo/ClangPseudo.cpp
index 6fb8f58fa016c..449b9181f3ee0 100644
--- a/clang/tools/clang-pseudo/ClangPseudo.cpp
+++ b/clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -18,30 +20,45 @@ using llvm::cl::init;
 using llvm::cl::opt;
 
 static opt<std::string>
-    CheckGrammar("check-grammar", desc("Parse and check a BNF grammar file."),
-                 init(""));
+    Grammar("grammar", desc("Parse and check a BNF grammar file."), init(""));
+static opt<bool> PrintGraph("print-graph",
+                            desc("Print the LR graph for the grammar"));
+static opt<bool> PrintTable("print-table",
+                            desc("Print the LR table for the grammar"));
+
+// Returns the contents of the file at Path.
+// If the file cannot be read, prints an error to stderr and exits with
+// status 1.
+static std::string readOrDie(llvm::StringRef Path) {
+  auto Buffer = llvm::MemoryBuffer::getFile(Path);
+  if (!Buffer) {
+    llvm::errs() << "Error: can't read file '" << Path
+                 << "': " << Buffer.getError().message() << "\n";
+    ::exit(1);
+  }
+  return (*Buffer)->getBuffer().str();
+}
 
 int main(int argc, char *argv[]) {
   llvm::cl::ParseCommandLineOptions(argc, argv, "");
 
-  if (CheckGrammar.getNumOccurrences()) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
-        llvm::MemoryBuffer::getFile(CheckGrammar);
-    if (std::error_code EC = Text.getError()) {
-      llvm::errs() << "Error: can't read grammar file '" << CheckGrammar
-                   << "': " << EC.message() << "\n";
-      return 1;
-    }
+  if (Grammar.getNumOccurrences()) {
+    std::string Text = readOrDie(Grammar);
     std::vector<std::string> Diags;
-    auto RSpecs = Grammar::parseBNF(Text.get()->getBuffer(), Diags);
+    auto G = Grammar::parseBNF(Text, Diags);
 
     if (!Diags.empty()) {
       llvm::errs() << llvm::join(Diags, "\n");
       return 2;
     }
-    llvm::errs() << llvm::formatv("grammar file {0} is parsed successfully\n",
-                                  CheckGrammar);
+    llvm::outs() << llvm::formatv("grammar file {0} is parsed successfully\n",
+                                  Grammar);
+    if (PrintGraph)
+      llvm::outs() << clang::syntax::pseudo::LRGraph::buildLR0(*G).dumpForTests(
+          *G);
+    if (PrintTable)
+      llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests(
+          *G);
     return 0;
   }
+
   return 0;
 }

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
index de1e1216c58d6..509e9e4a1598b 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -4,7 +4,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_clang_unittest(ClangPseudoTests
   GrammarTest.cpp
-  LRGraphTest.cpp
+  LRTableTest.cpp
 )
 
 clang_target_link_libraries(ClangPseudoTests

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
deleted file mode 100644
index e7f7e1a7e65d9..0000000000000
--- a/clang/unittests/Tooling/Syntax/Pseudo/LRGraphTest.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-//===--- LRGraphTest.cpp - LRGraph tests -------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include <memory>
-
-namespace clang {
-namespace syntax {
-namespace pseudo {
-namespace {
-
-TEST(LRGraph, Build) {
-  struct TestCase {
-    llvm::StringRef BNF;
-    llvm::StringRef ExpectedStates;
-  };
-
-  TestCase Cases[] = {{
-                          R"bnf(
-_ := expr
-expr := IDENTIFIER
-      )bnf",
-                          R"(States:
-State 0
-    _ :=  • expr
-    expr :=  • IDENTIFIER
-State 1
-    _ := expr • 
-State 2
-    expr := IDENTIFIER • 
-0 ->[expr] 1
-0 ->[IDENTIFIER] 2
-)"},
-                      {// A grammar with a S/R conflict in SLR table:
-                       // (id-id)-id, or id-(id-id).
-                       R"bnf(
-_ := expr
-expr := expr - expr  # S/R conflict at state 4 on '-' token
-expr := IDENTIFIER
-      )bnf",
-                       R"(States:
-State 0
-    _ :=  • expr
-    expr :=  • expr - expr
-    expr :=  • IDENTIFIER
-State 1
-    _ := expr • 
-    expr := expr • - expr
-State 2
-    expr := IDENTIFIER • 
-State 3
-    expr :=  • expr - expr
-    expr := expr - • expr
-    expr :=  • IDENTIFIER
-State 4
-    expr := expr - expr • 
-    expr := expr • - expr
-0 ->[expr] 1
-0 ->[IDENTIFIER] 2
-1 ->[-] 3
-3 ->[expr] 4
-3 ->[IDENTIFIER] 2
-4 ->[-] 3
-)"}};
-  for (const auto &C : Cases) {
-    std::vector<std::string> Diags;
-    auto G = Grammar::parseBNF(C.BNF, Diags);
-    ASSERT_THAT(Diags, testing::IsEmpty());
-    auto LR0 = LRGraph::buildLR0(*G);
-    EXPECT_EQ(LR0.dumpForTests(*G), C.ExpectedStates);
-  }
-}
-
-} // namespace
-} // namespace pseudo
-} // namespace syntax
-} // namespace clang

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp
new file mode 100644
index 0000000000000..88ac697ce250d
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/LRTableTest.cpp
@@ -0,0 +1,56 @@
+//===--- LRTableTest.cpp - ---------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::IsEmpty;
+using testing::UnorderedElementsAre;
+using Action = LRTable::Action;
+
+// Builds a small hand-written table and checks that find() returns exactly
+// the actions inserted for each (state, terminal) pair, and nothing for pairs
+// that have no entry.
+TEST(LRTable, Builder) {
+  //           eof   semi  ...
+  // +-------+----+-------+---
+  // |state0 |    | s0,r0 |...
+  // |state1 | acc|       |...
+  // |state2 |    |  r1   |...
+  // +-------+----+-------+---
+  std::vector<LRTable::Entry> Entries = {
+      {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)},
+      {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)},
+      {/* State */ 1, tokenSymbol(tok::eof), Action::accept(2)},
+      {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}};
+  GrammarTable GT;
+  LRTable T = LRTable::buildForTests(GT, Entries);
+  EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty());
+  EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)),
+              UnorderedElementsAre(Action::shift(0), Action::reduce(0)));
+  EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)),
+              UnorderedElementsAre(Action::accept(2)));
+  EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty());
+  EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)),
+              UnorderedElementsAre(Action::reduce(1)));
+  // Verify the behavior for terminals that have no available actions.
+  EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty());
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang


        


More information about the cfe-commits mailing list