[clang] fe932a8 - [pseudo] Add first and follow set computation in Grammar.
Haojian Wu via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 9 00:16:34 PST 2022
Author: Haojian Wu
Date: 2022-02-09T09:16:27+01:00
New Revision: fe932a88e970d707c0759fae1f211d0f40ca06da
URL: https://github.com/llvm/llvm-project/commit/fe932a88e970d707c0759fae1f211d0f40ca06da
DIFF: https://github.com/llvm/llvm-project/commit/fe932a88e970d707c0759fae1f211d0f40ca06da.diff
LOG: [pseudo] Add first and follow set computation in Grammar.
These will be used when building the parsing table for LR parsers.
Separate from https://reviews.llvm.org/D118196.
Differential Revision: https://reviews.llvm.org/D118990
Added:
Modified:
clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
index 80db9f268ff13..c6e2f09d5fb47 100644
--- a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -20,7 +20,7 @@
// non-terminal or terminal, identified by a SymbolID.
//
// Notions about the BNF grammar:
-// - "_" is the augmented symbol, formed by start symbols.
+// - "_" is the start symbol of the augmented grammar;
// - single-line comment is supported, starting with a #
// - A rule describes how a nonterminal (left side of :=) is constructed, and
// it is *per line* in the grammar file
@@ -38,6 +38,7 @@
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include <cstdint>
#include <vector>
@@ -110,13 +111,16 @@ struct GrammarTable;
// It is a building block for constructing a table-based parser.
class Grammar {
public:
- explicit Grammar(std::unique_ptr<GrammarTable> T) : T(std::move(T)) {}
+ explicit Grammar(std::unique_ptr<GrammarTable>);
// Parses grammar from a BNF file.
// Diagnostics emitted during parsing are stored in Diags.
static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF,
std::vector<std::string> &Diags);
+ // Returns the SymbolID of the start symbol '_'.
+ SymbolID startSymbol() const { return StartSymbol; };
+
// Returns all rules of the given non-terminal symbol.
llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
const Rule &lookupRule(RuleID RID) const;
@@ -136,7 +140,15 @@ class Grammar {
private:
std::unique_ptr<GrammarTable> T;
+ // The start symbol '_' of the augmented grammar.
+ SymbolID StartSymbol;
};
+// For each nonterminal X, computes the set of terminals that begin strings
+// derived from X. (Known as FIRST sets in grammar-based parsers).
+std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
+// For each nonterminal X, computes the set of terminals that could immediately
+// follow X. (Known as FOLLOW sets in grammar-based parsers).
+std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
// Storage for the underlying data of the Grammar.
// It can be constructed dynamically (from compiling BNF file) or statically
diff --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
index 014e6b4d28bc6..a2cd51a6c7569 100644
--- a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
+++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -23,6 +24,16 @@ Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
llvm::copy(Sequence, this->Sequence);
}
+Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
+ // The start symbol is named "_"; binary-search for it (needs Nonterminals partitioned by Name).
+ auto It = llvm::partition_point(
+ T->Nonterminals,
+ [](const GrammarTable::Nonterminal &X) { return X.Name < "_"; });
+ assert(It != T->Nonterminals.end() && It->Name == "_" &&
+ "symbol _ must exist in the grammar!");
+ StartSymbol = It - T->Nonterminals.begin(); // Index of "_" within Nonterminals.
+}
+
llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
assert(isNonterminal(SID));
const auto &R = T->Nonterminals[SID].RuleRange;
@@ -72,6 +83,86 @@ std::string Grammar::dump() const {
return OS.str();
}
+std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
+ std::vector<llvm::DenseSet<SymbolID>> FirstSets(
+ G.table().Nonterminals.size());
+ auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
+ assert(isNonterminal(Target));
+ if (isToken(First)) // A terminal contributes only itself.
+ return FirstSets[Target].insert(First).second;
+ bool Changed = false;
+ for (SymbolID SID : FirstSets[First]) // Merge in FIRST(First) as known so far.
+ Changed |= FirstSets[Target].insert(SID).second;
+ return Changed;
+ };
+
+ // A rule S := T ... implies elements in FIRST(S):
+ // - if T is a terminal, FIRST(S) contains T
+ // - if T is a nonterminal, FIRST(S) contains FIRST(T)
+ // Since FIRST(T) may not have been fully computed yet, FIRST(S) may itself be
+ // incomplete after a single pass, so we iterate over all rules until a full
+ // pass adds nothing new (a fixed point).
+ // (This isn't particularly efficient, but table building isn't on the
+ // critical path).
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (const auto &R : G.table().Rules)
+ // Only the first symbol of each rule matters: symbols are non-nullable, so
+ // later symbols can never begin a string derived from the target.
+ Changed |= ExpandFirstSet(R.Target, R.seq().front());
+ }
+ return FirstSets;
+}
+
+std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
+ auto FirstSets = firstSets(G);
+ std::vector<llvm::DenseSet<SymbolID>> FollowSets(
+ G.table().Nonterminals.size());
+ // Adds every symbol in ToAdd to the follow set of the nonterminal Y; returns
+ // true if the follow set grew.
+ auto ExpandFollowSet = [&FollowSets](SymbolID Y,
+ const llvm::DenseSet<SymbolID> &ToAdd) {
+ assert(isNonterminal(Y));
+ bool Changed = false;
+ for (SymbolID F : ToAdd)
+ Changed |= FollowSets[Y].insert(F).second;
+ return Changed;
+ };
+ // Follow sets are computed using the following 3 rules. The computation is
+ // complete once a fixed point is reached, i.e. no new symbol can be added to
+ // any of the follow sets.
+ //
+ // Rule 1: add the end marker (tok::eof) to FOLLOW(S), where S is the start symbol.
+ FollowSets[G.startSymbol()].insert(tokenSymbol(tok::eof));
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (const auto &R : G.table().Rules) {
+ // Rule 2: for a rule X := ... Y Z ..., we add all symbols from FIRST(Z)
+ // to FOLLOW(Y).
+ for (size_t i = 0; i + 1 < R.seq().size(); ++i) {
+ if (isToken(R.seq()[i]))
+ continue;
+ // We only need to consider the next symbol because symbols are
+ // non-nullable.
+ SymbolID Next = R.seq()[i + 1];
+ if (isToken(Next))
+ // The first set of a terminal is the terminal itself.
+ Changed |= ExpandFollowSet(R.seq()[i], {Next});
+ else
+ Changed |= ExpandFollowSet(R.seq()[i], FirstSets[Next]);
+ }
+ // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
+ // FOLLOW(Z).
+ SymbolID Z = R.seq().back();
+ if (isNonterminal(Z))
+ Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
+ }
+ }
+ return FollowSets;
+}
+
} // namespace pseudo
} // namespace syntax
} // namespace clang
diff --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
index dc57f16cf3b93..1b928b34062a5 100644
--- a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
+++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
@@ -19,6 +19,7 @@ namespace {
using testing::AllOf;
using testing::ElementsAre;
using testing::IsEmpty;
+using testing::Pair;
using testing::UnorderedElementsAre;
MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
@@ -33,7 +34,7 @@ class GrammarTest : public ::testing::Test {
G = Grammar::parseBNF(BNF, Diags);
}
- SymbolID lookup(llvm::StringRef Name) const {
+ SymbolID id(llvm::StringRef Name) const {
for (unsigned I = 0; I < NumTerminals; ++I)
if (G->table().Terminals[I] == Name)
return tokenSymbol(static_cast<tok::TokenKind>(I));
@@ -50,31 +51,28 @@ class GrammarTest : public ::testing::Test {
};
TEST_F(GrammarTest, Basic) {
- build("expression := IDENTIFIER + expression # comment");
+ build("_ := IDENTIFIER + _ # comment");
EXPECT_THAT(Diags, IsEmpty());
auto ExpectedRule =
- AllOf(TargetID(lookup("expression")),
- Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression")));
- auto ExpressionID = lookup("expression");
- EXPECT_EQ(G->symbolName(ExpressionID), "expression");
- EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule));
+ AllOf(TargetID(id("_")), Sequence(id("IDENTIFIER"), id("+"), id("_")));
+ EXPECT_EQ(G->symbolName(id("_")), "_");
+ EXPECT_THAT(G->rulesFor(id("_")), UnorderedElementsAre(ExpectedRule));
const auto &Rule = G->lookupRule(/*RID=*/0);
EXPECT_THAT(Rule, ExpectedRule);
EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER");
EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+");
- EXPECT_THAT(G->symbolName(Rule.seq()[2]), "expression");
+ EXPECT_THAT(G->symbolName(Rule.seq()[2]), "_");
}
TEST_F(GrammarTest, EliminatedOptional) {
build("_ := CONST_opt INT ;_opt");
EXPECT_THAT(Diags, IsEmpty());
EXPECT_THAT(G->table().Rules,
- UnorderedElementsAre(
- Sequence(lookup("INT")),
- Sequence(lookup("CONST"), lookup("INT")),
- Sequence(lookup("CONST"), lookup("INT"), lookup(";")),
- Sequence(lookup("INT"), lookup(";"))));
+ UnorderedElementsAre(Sequence(id("INT")),
+ Sequence(id("CONST"), id("INT")),
+ Sequence(id("CONST"), id("INT"), id(";")),
+ Sequence(id("INT"), id(";"))));
}
TEST_F(GrammarTest, Diagnostics) {
@@ -87,6 +85,7 @@ TEST_F(GrammarTest, Diagnostics) {
invalid
)cpp");
+ EXPECT_EQ(G->startSymbol(), id("_"));
EXPECT_THAT(Diags, UnorderedElementsAre(
"Rule '_ := ,_opt' has a nullable RHS",
"Rule 'null := ' has a nullable RHS",
@@ -96,6 +95,66 @@ TEST_F(GrammarTest, Diagnostics) {
"No rules for nonterminal: IDENFIFIE"));
}
+TEST_F(GrammarTest, FirstAndFollowSets) {
+ build( // A left-recursive expression grammar (expr := expr - term).
+ R"bnf(
+_ := expr
+expr := expr - term
+expr := term
+term := IDENTIFIER
+term := ( expr )
+)bnf");
+ ASSERT_TRUE(Diags.empty());
+ auto ToPairs = [](std::vector<llvm::DenseSet<SymbolID>> Input) { // Pairs each set with its index.
+ std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
+ for (SymbolID ID = 0; ID < Input.size(); ++ID)
+ Sets.emplace_back(ID, std::move(Input[ID]));
+ return Sets;
+ };
+
+ EXPECT_THAT(
+ ToPairs(firstSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+ Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+ Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
+ EXPECT_THAT(
+ ToPairs(followSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+ Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
+ Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
+
+ build(R"bnf(
+# A simplfied C++ decl-specifier-seq.
+_ := decl-specifier-seq
+decl-specifier-seq := decl-specifier decl-specifier-seq
+decl-specifier-seq := decl-specifier
+decl-specifier := simple-type-specifier
+decl-specifier := INLINE
+simple-type-specifier := INT
+ )bnf");
+ ASSERT_TRUE(Diags.empty());
+ EXPECT_THAT(
+ ToPairs(firstSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
+ Pair(id("decl-specifier-seq"),
+ UnorderedElementsAre(id("INLINE"), id("INT"))),
+ Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
+ Pair(id("decl-specifier"),
+ UnorderedElementsAre(id("INLINE"), id("INT")))));
+ EXPECT_THAT(
+ ToPairs(followSets(*G)),
+ UnorderedElementsAre(
+ Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+ Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
+ Pair(id("decl-specifier"),
+ UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
+ Pair(id("simple-type-specifier"),
+ UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
+}
+
} // namespace
} // namespace pseudo
} // namespace syntax
More information about the cfe-commits
mailing list