[clang] fe932a8 - [pseudo] Add first and follow set computation in Grammar.

Wed Feb 9 00:16:34 PST 2022

Author: Haojian Wu
Date: 2022-02-09T09:16:27+01:00
New Revision: fe932a88e970d707c0759fae1f211d0f40ca06da

URL: https://github.com/llvm/llvm-project/commit/fe932a88e970d707c0759fae1f211d0f40ca06da
DIFF: https://github.com/llvm/llvm-project/commit/fe932a88e970d707c0759fae1f211d0f40ca06da.diff

LOG: [pseudo] Add first and follow set computation in Grammar.

These will be used when building parsing table for LR parsers.

Separate from https://reviews.llvm.org/D118196.

Differential Revision: https://reviews.llvm.org/D118990

Added: 
    

Modified: 
    clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
    clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
    clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
index 80db9f268ff13..c6e2f09d5fb47 100644

--- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -20,7 +20,7 @@
 //  non-terminal or terminal, identified by a SymbolID.
 //
 //  Notions about the BNF grammar:
-//  - "_" is the augmented symbol, formed by start symbols.
+//  - "_" is the start symbol of the augmented grammar;
 //  - single-line comment is supported, starting with a #
 //  - A rule describes how a nonterminal (left side of :=) is constructed, and
 //    it is *per line* in the grammar file
@@ -38,6 +38,7 @@
 
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include <cstdint>
 #include <vector>
@@ -110,13 +111,16 @@ struct GrammarTable;
 // It is a building block for constructing a table-based parser.
 class Grammar {
 public:
-  explicit Grammar(std::unique_ptr<GrammarTable> T) : T(std::move(T)) {}
+  explicit Grammar(std::unique_ptr<GrammarTable>);
 
   // Parses grammar from a BNF file.
   // Diagnostics emitted during parsing are stored in Diags.
   static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF,
                                            std::vector<std::string> &Diags);
 
+  // Returns the SymbolID of the start symbol '_'.
+  SymbolID startSymbol() const { return StartSymbol; };
+
   // Returns all rules of the given non-terminal symbol.
   llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
   const Rule &lookupRule(RuleID RID) const;
@@ -136,7 +140,15 @@ class Grammar {
 
 private:
   std::unique_ptr<GrammarTable> T;
+  // The start symbol '_' of the augmented grammar.
+  SymbolID StartSymbol;
 };
+// For each nonterminal X, computes the set of terminals that begin strings
+// derived from X. (Known as FIRST sets in grammar-based parsers).
+std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
+// For each nonterminal X, computes the set of terminals that could immediately
+// follow X. (Known as FOLLOW sets in grammar-based parsers).
+std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
 
 // Storage for the underlying data of the Grammar.
 // It can be constructed dynamically (from compiling BNF file) or statically

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
index 014e6b4d28bc6..a2cd51a6c7569 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -23,6 +24,16 @@ Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
   llvm::copy(Sequence, this->Sequence);
 }
 
+Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
+  // start symbol is named _, binary search it.
+  auto It = llvm::partition_point(
+      T->Nonterminals,
+      [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; });
+  assert(It != T->Nonterminals.end() && It->Name == "_" &&
+         "symbol _ must exist in the grammar!");
+  StartSymbol = It - T->Nonterminals.begin();
+}
+
 llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
   assert(isNonterminal(SID));
   const auto &R = T->Nonterminals[SID].RuleRange;
@@ -72,6 +83,86 @@ std::string Grammar::dump() const {
   return OS.str();
 }
 
+std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
+  std::vector<llvm::DenseSet<SymbolID>> FirstSets(
+      G.table().Nonterminals.size());
+  auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
+    assert(isNonterminal(Target));
+    if (isToken(First))
+      return FirstSets[Target].insert(First).second;
+    bool Changed = false;
+    for (SymbolID SID : FirstSets[First])
+      Changed |= FirstSets[Target].insert(SID).second;
+    return Changed;
+  };
+
+  // A rule S := T ... implies elements in FIRST(S):
+  //  - if T is a terminal, FIRST(S) contains T
+  //  - if T is a nonterminal, FIRST(S) contains FIRST(T)
+  // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
+  // end up being incomplete.
+  // We iterate until we hit a fixed point.
+  // (This isn't particularly efficient, but table building isn't on the
+  // critical path).
+  bool Changed = true;
+  while (Changed) {
+    Changed = false;
+    for (const auto &R : G.table().Rules)
+      // We only need to consider the first element because symbols are
+      // non-nullable.
+      Changed |= ExpandFirstSet(R.Target, R.seq().front());
+  }
+  return FirstSets;
+}
+
+std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
+  auto FirstSets = firstSets(G);
+  std::vector<llvm::DenseSet<SymbolID>> FollowSets(
+      G.table().Nonterminals.size());
+  // Expand the follow set of a non-terminal symbol Y by adding all from the
+  // given symbol set.
+  auto ExpandFollowSet = [&FollowSets](SymbolID Y,
+                                       const llvm::DenseSet<SymbolID> &ToAdd) {
+    assert(isNonterminal(Y));
+    bool Changed = false;
+    for (SymbolID F : ToAdd)
+      Changed |= FollowSets[Y].insert(F).second;
+    return Changed;
+  };
+  // Follow sets is computed based on the following 3 rules, the computation
+  // is completed at a fixed point where there is no more new symbols can be
+  // added to any of the follow sets.
+  //
+  // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol.
+  FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof));
+  bool Changed = true;
+  while (Changed) {
+    Changed = false;
+    for (const auto &R : G.table().Rules) {
+      // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
+      // FOLLOW(Y).
+      for (size_t i = 0; i + 1 < R.seq().size(); ++i) {
+        if (isToken(R.seq()[i]))
+          continue;
+        // We only need to consider the next symbol because symbols are
+        // non-nullable.
+        SymbolID Next = R.seq()[i + 1];
+        if (isToken(Next))
+          // First set for a terminal is itself.
+          Changed |= ExpandFollowSet(R.seq()[i], {Next});
+        else
+          Changed |= ExpandFollowSet(R.seq()[i], FirstSets[Next]);
+      }
+      // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
+      // FOLLOW(Z).
+      SymbolID Z = R.seq().back();
+      if (isNonterminal(Z))
+        Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
+    }
+  }
+  return FollowSets;
+}
+
 } // namespace pseudo
 } // namespace syntax
 } // namespace clang

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
index dc57f16cf3b93..1b928b34062a5 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
+++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
@@ -19,6 +19,7 @@ namespace {
 using testing::AllOf;
 using testing::ElementsAre;
 using testing::IsEmpty;
+using testing::Pair;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
@@ -33,7 +34,7 @@ class GrammarTest : public ::testing::Test {
     G = Grammar::parseBNF(BNF, Diags);
   }
 
-  SymbolID lookup(llvm::StringRef Name) const {
+  SymbolID id(llvm::StringRef Name) const {
     for (unsigned I = 0; I < NumTerminals; ++I)
       if (G->table().Terminals[I] == Name)
         return tokenSymbol(static_cast<tok::TokenKind>(I));
@@ -50,31 +51,28 @@ class GrammarTest : public ::testing::Test {
 };
 
 TEST_F(GrammarTest, Basic) {
-  build("expression := IDENTIFIER + expression # comment");
+  build("_ := IDENTIFIER + _ # comment");
   EXPECT_THAT(Diags, IsEmpty());
 
   auto ExpectedRule =
-      AllOf(TargetID(lookup("expression")),
-            Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression")));
-  auto ExpressionID = lookup("expression");
-  EXPECT_EQ(G->symbolName(ExpressionID), "expression");
-  EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule));
+      AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_")));
+  EXPECT_EQ(G->symbolName(id("_")), "_");
+  EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule));
   const auto &Rule = G->lookupRule(/*RID=*/0);
   EXPECT_THAT(Rule, ExpectedRule);
   EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER");
   EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+");
-  EXPECT_THAT(G->symbolName(Rule.seq()[2]), "expression");
+  EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_");
 }
 
 TEST_F(GrammarTest, EliminatedOptional) {
   build("_ := CONST_opt INT ;_opt");
   EXPECT_THAT(Diags, IsEmpty());
   EXPECT_THAT(G->table().Rules,
-              UnorderedElementsAre(
-                  Sequence(lookup("INT")),
-                  Sequence(lookup("CONST"), lookup("INT")),
-                  Sequence(lookup("CONST"), lookup("INT"), lookup(";")),
-                  Sequence(lookup("INT"), lookup(";"))));
+              UnorderedElementsAre(Sequence(id("INT")),
+                                   Sequence(id("CONST"), id("INT")),
+                                   Sequence(id("CONST"), id("INT"), id(";")),
+                                   Sequence(id("INT"), id(";"))));
 }
 
 TEST_F(GrammarTest, Diagnostics) {
@@ -87,6 +85,7 @@ TEST_F(GrammarTest, Diagnostics) {
     invalid
   )cpp");
 
+  EXPECT_EQ(G->startSymbol(), id("_"));
   EXPECT_THAT(Diags, UnorderedElementsAre(
                          "Rule '_ := ,_opt' has a nullable RHS",
                          "Rule 'null := ' has a nullable RHS",
@@ -96,6 +95,66 @@ TEST_F(GrammarTest, Diagnostics) {
                          "No rules for nonterminal: IDENFIFIE"));
 }
 
+TEST_F(GrammarTest, FirstAndFollowSets) {
+  build(
+      R"bnf(
+_ := expr
+expr := expr - term
+expr := term
+term := IDENTIFIER
+term := ( expr )
+)bnf");
+  ASSERT_TRUE(Diags.empty());
+  auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
+    std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
+    for (SymbolID ID = 0; ID < Input.size(); ++ID)
+      Sets.emplace_back(ID, std::move(Input[ID]));
+    return Sets;
+  };
+
+  EXPECT_THAT(
+      ToPairs(firstSets(*G)),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+          Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+          Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
+  EXPECT_THAT(
+      ToPairs(followSets(*G)),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+          Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
+          Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
+
+  build(R"bnf(
+# A simplfied C++ decl-specifier-seq.
+_ := decl-specifier-seq
+decl-specifier-seq := decl-specifier decl-specifier-seq
+decl-specifier-seq := decl-specifier
+decl-specifier := simple-type-specifier
+decl-specifier := INLINE
+simple-type-specifier := INT
+   )bnf");
+  ASSERT_TRUE(Diags.empty());
+  EXPECT_THAT(
+      ToPairs(firstSets(*G)),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
+          Pair(id("decl-specifier-seq"),
+               UnorderedElementsAre(id("INLINE"), id("INT"))),
+          Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
+          Pair(id("decl-specifier"),
+               UnorderedElementsAre(id("INLINE"), id("INT")))));
+  EXPECT_THAT(
+      ToPairs(followSets(*G)),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+          Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
+          Pair(id("decl-specifier"),
+               UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
+          Pair(id("simple-type-specifier"),
+               UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
+}
+
 } // namespace
 } // namespace pseudo
 } // namespace syntax