[clang-tools-extra] f383b88 - [pseudo] Sort nonterminals based on their reduction order.

Thu Mar 24 06:31:22 PDT 2022

Author: Haojian Wu
Date: 2022-03-24T14:30:12+01:00
New Revision: f383b88d8214dc8f2d8bc9521c3ddd1c4454927f

URL: https://github.com/llvm/llvm-project/commit/f383b88d8214dc8f2d8bc9521c3ddd1c4454927f
DIFF: https://github.com/llvm/llvm-project/commit/f383b88d8214dc8f2d8bc9521c3ddd1c4454927f.diff

LOG: [pseudo] Sort nonterminals based on their reduction order.

Reductions need to be performed in a careful order in GLR parser, to
make sure we gather all alternatives before creating an ambigous forest
node.

This patch encodes the nonterminal order into the rule id, so that we
can efficiently to determinal ordering of reductions in GLR parser.

This patch also abstracts to a TestGrammar, which is shared among tests.

This is a part of the GLR parser, https://reviews.llvm.org/D121368,
https://reviews.llvm.org/D121150

Differential Revision: https://reviews.llvm.org/D122303

Added: 
    

Modified: 
    clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
    clang-tools-extra/pseudo/lib/GrammarBNF.cpp
    clang-tools-extra/pseudo/test/lr-build-basic.test
    clang-tools-extra/pseudo/test/lr-build-conflicts.test
    clang-tools-extra/pseudo/unittests/GrammarTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
index 106734b1b022a..46e8fcbcafa2c 100644

--- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
@@ -165,8 +165,15 @@ struct GrammarTable {
     } RuleRange;
   };
 
-  // The rules are sorted (and thus grouped) by target symbol.
-  // RuleID is the index of the vector.
+  // RuleID is an index into this table of rule definitions.
+  //
+  // Rules with the same target symbol (LHS) are grouped into a single range.
+  // The relative order of 
diff erent target symbols is *not* by SymbolID, but
+  // rather a topological sort: if S := T then the rules producing T have lower
+  // RuleIDs than rules producing S.
+  // (This strange order simplifies the GLR parser: for a given token range, if
+  // we reduce in increasing RuleID order then we need never backtrack --
+  // prerequisite reductions are reached before dependent ones).
   std::vector<Rule> Rules;
   // A table of terminals (aka tokens). It corresponds to the clang::Token.
   // clang::tok::TokenKind is the index of the table.

diff  --git a/clang-tools-extra/pseudo/lib/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/GrammarBNF.cpp
index 57ed2ca7c8f0b..7ca5bc1a90221 100644
--- a/clang-tools-extra/pseudo/lib/GrammarBNF.cpp
+++ b/clang-tools-extra/pseudo/lib/GrammarBNF.cpp
@@ -9,9 +9,11 @@
 #include "clang-pseudo/Grammar.h"
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <memory>
+#include <utility>
 
 namespace clang {
 namespace pseudo {
@@ -87,23 +89,75 @@ class GrammarBuilder {
     }
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
-    llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
-      // Sorted by the Target.
-      return std::tie(Left.Target, Left.Size) <
-             std::tie(Right.Target, Right.Size);
-    });
-    RuleID RulePos = 0;
+    const auto &SymbolOrder = getTopologicalOrder(T.get());
+    llvm::stable_sort(
+        T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
+          // Sorted by the topological order of the nonterminal Target.
+          return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
+        });
     for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
-      RuleID Start = RulePos;
-      while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID)
-        ++RulePos;
-      T->Nonterminals[SID].RuleRange = {Start, RulePos};
+      auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
+        return SymbolOrder[R.Target] < SymbolOrder[SID];
+      });
+      RuleID Start = StartIt - T->Rules.begin();
+      RuleID End = Start;
+      while (End < T->Rules.size() && T->Rules[End].Target == SID)
+        ++End;
+      T->Nonterminals[SID].RuleRange = {Start, End};
     }
     auto G = std::make_unique<Grammar>(std::move(T));
     diagnoseGrammar(*G);
     return G;
   }
 
+  // Gets topological order for nonterminal symbols.
+  //
+  // The topological order is defined as: if a *single* nonterminal A produces
+  // (or transitively) a nonterminal B (that said, there is a production rule
+  // B := A), then A is less than B.
+  //
+  // It returns the sort key for each symbol, the array is indexed by SymbolID.
+  std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
+    std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
+    for (const auto &Rule : T->Rules) {
+      // if A := B, A depends on B.
+      if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
+        Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
+    }
+    llvm::sort(Dependencies);
+    std::vector<SymbolID> Order;
+    // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
+    enum State {
+      NotVisited,
+      Visiting,
+      Visited,
+    };
+    std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
+    std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
+      if (VisitStates[SID] == Visited)
+        return;
+      if (VisitStates[SID] == Visiting) {
+        Diagnostics.push_back(
+            llvm::formatv("The grammar contains a cycle involving symbol {0}",
+                          T->Nonterminals[SID].Name));
+        return;
+      }
+      VisitStates[SID] = Visiting;
+      for (auto It = llvm::lower_bound(Dependencies,
+                                       std::pair<SymbolID, SymbolID>{SID, 0});
+           It != Dependencies.end() && It->first == SID; ++It)
+        DFS(It->second);
+      VisitStates[SID] = Visited;
+      Order.push_back(SID);
+    };
+    for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
+      DFS(ID);
+    std::vector<unsigned> Result(T->Nonterminals.size(), 0);
+    for (size_t I = 0; I < Order.size(); ++I)
+      Result[Order[I]] = I;
+    return Result;
+  }
+
 private:
   // Text representation of a BNF grammar rule.
   struct RuleSpec {

diff  --git a/clang-tools-extra/pseudo/test/lr-build-basic.test b/clang-tools-extra/pseudo/test/lr-build-basic.test
index cfb54e11a27f0..5c017fa0e4dae 100644
--- a/clang-tools-extra/pseudo/test/lr-build-basic.test
+++ b/clang-tools-extra/pseudo/test/lr-build-basic.test
@@ -26,4 +26,4 @@ id := IDENTIFIER
 # TABLE-NEXT: State 2
 # TABLE-NEXT:     'EOF': reduce by rule 1 'expr := id'
 # TABLE-NEXT: State 3
-# TABLE-NEXT:     'EOF': reduce by rule 2 'id := IDENTIFIER'
+# TABLE-NEXT:     'EOF': reduce by rule 0 'id := IDENTIFIER'

diff  --git a/clang-tools-extra/pseudo/test/lr-build-conflicts.test b/clang-tools-extra/pseudo/test/lr-build-conflicts.test
index 4292a7184e0f8..aefa905413bd5 100644
--- a/clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ b/clang-tools-extra/pseudo/test/lr-build-conflicts.test
@@ -5,8 +5,8 @@ expr := IDENTIFIER
 # RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
 #      GRAPH: States
 # GRAPH-NEXT: State 0
-# GRAPH-NEXT:     _ :=  • expr
 # GRAPH-NEXT:     expr :=  • expr - expr
+# GRAPH-NEXT:     _ :=  • expr
 # GRAPH-NEXT:     expr :=  • IDENTIFIER
 # GRAPH-NEXT: State 1
 # GRAPH-NEXT:     _ := expr • 
@@ -42,6 +42,6 @@ expr := IDENTIFIER
 # TABLE-NEXT:     'IDENTIFIER': shift state 2
 # TABLE-NEXT:     'expr': go to state 4
 # TABLE-NEXT: State 4
-# TABLE-NEXT:     'EOF': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT:     'EOF': reduce by rule 0 'expr := expr - expr'
 # TABLE-NEXT:     '-': shift state 3
-# TABLE-NEXT:     '-': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT:     '-': reduce by rule 0 'expr := expr - expr'

diff  --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
index 2a107063fd679..19e9481b7ed15 100644
--- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -44,6 +44,16 @@ class GrammarTest : public ::testing::Test {
     return 0;
   }
 
+  RuleID ruleFor(llvm::StringRef NonterminalName) const {
+    auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange;
+    if (RuleRange.End - RuleRange.Start == 1)
+      return G->table().Nonterminals[id(NonterminalName)].RuleRange.Start;
+    ADD_FAILURE() << "Expected a single rule for " << NonterminalName
+                  << ", but it has " << RuleRange.End - RuleRange.Start
+                  << " rule!\n";
+    return 0;
+  }
+
 protected:
   std::unique_ptr<Grammar> G;
   std::vector<std::string> Diags;
@@ -74,6 +84,21 @@ TEST_F(GrammarTest, EliminatedOptional) {
                                    Sequence(id("INT"), id(";"))));
 }
 
+TEST_F(GrammarTest, RuleIDSorted) {
+  build(R"bnf(
+    _ := x
+
+    x := y
+    y := z
+    z := IDENTIFIER
+  )bnf");
+  ASSERT_TRUE(Diags.empty());
+
+  EXPECT_LT(ruleFor("z"), ruleFor("y"));
+  EXPECT_LT(ruleFor("y"), ruleFor("x"));
+  EXPECT_LT(ruleFor("x"), ruleFor("_"));
+}
+
 TEST_F(GrammarTest, Diagnostics) {
   build(R"cpp(
     _ := ,_opt
@@ -82,6 +107,9 @@ TEST_F(GrammarTest, Diagnostics) {
     _ := IDENFIFIE # a typo of the terminal IDENFITIER
 
     invalid
+    # cycle
+    a := b
+    b := a
   )cpp");
 
   EXPECT_EQ(G->startSymbol(), id("_"));
@@ -91,7 +119,8 @@ TEST_F(GrammarTest, Diagnostics) {
                          "No rules for nonterminal: undefined-sym",
                          "Failed to parse 'invalid': no separator :=",
                          "Token-like name IDENFIFIE is used as a nonterminal",
-                         "No rules for nonterminal: IDENFIFIE"));
+                         "No rules for nonterminal: IDENFIFIE",
+                         "The grammar contains a cycle involving symbol a"));
 }
 
 TEST_F(GrammarTest, FirstAndFollowSets) {