[clang-tools-extra] f1ac00c - [pseudo] Add grammar annotations support.
Haojian Wu via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 9 03:08:01 PDT 2022
Author: Haojian Wu
Date: 2022-06-09T12:06:22+02:00
New Revision: f1ac00c9b0d17e48f464709fc554ebf73f165158
URL: https://github.com/llvm/llvm-project/commit/f1ac00c9b0d17e48f464709fc554ebf73f165158
DIFF: https://github.com/llvm/llvm-project/commit/f1ac00c9b0d17e48f464709fc554ebf73f165158.diff
LOG: [pseudo] Add grammar annotations support.
Add annotation handling ([key=value]) to the BNF grammar parser. Annotations
will be used for conditional reduction and error recovery.
Reviewed By: sammccall
Differential Revision: https://reviews.llvm.org/D126536
Added:
Modified:
clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
clang-tools-extra/pseudo/unittests/GrammarTest.cpp
Removed:
################################################################################
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
index df757c61a1ed..fff96e9771c8 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
@@ -19,6 +19,22 @@
// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
// nonterminal or terminal, identified by a SymbolID.
//
+// Annotations are supported in a syntax form of [key=value]. They specify
+// attributes which are associated with either a grammar symbol (on the
+// right-hand side of the symbol) or a grammar rule (at the end of the rule
+// body).
+// Attributes provide a way to inject custom code into the GLR parser. Each
+// unique attribute value creates an extension point (identified by an
+// ExtensionID), and an extension point corresponds to a piece of native code.
+// For example, the C++ grammar has a rule:
+//
+// contextual-override := IDENTIFIER [guard=Override]
+//
+// The GLR parser only performs the reduction of this rule if the IDENTIFIER
+// content is `override`. This Override guard is implemented in CXX.cpp by
+// binding the ExtensionID for the `Override` value to a specific C++ function
+// that performs the check.
+//
// Notions about the BNF grammar:
// - "_" is the start symbol of the augmented grammar;
// - single-line comment is supported, starting with a #
@@ -69,6 +85,11 @@ inline SymbolID tokenSymbol(tok::TokenKind TK) {
return TokenFlag | static_cast<SymbolID>(TK);
}
+// An extension is a piece of native code specific to a grammar that modifies
+// the behavior of annotated rules. One ExtensionID is assigned for each unique
+// attribute value (all attributes share a namespace).
+using ExtensionID = uint16_t;
+
// A RuleID uniquely identifies a production rule in a grammar.
// It is an index into a table of rules.
using RuleID = uint16_t;
@@ -96,11 +117,17 @@ struct Rule {
uint8_t Size : SizeBits; // Size of the Sequence
SymbolID Sequence[MaxElements];
+ // A guard extension controls whether a reduction of a rule will be conducted
+ // by the GLR parser.
+ // 0 is the sentinel "unset" extension ID, indicating that no guard extension
+ // is set for this rule.
+ ExtensionID Guard = 0;
+
llvm::ArrayRef<SymbolID> seq() const {
return llvm::ArrayRef<SymbolID>(Sequence, Size);
}
friend bool operator==(const Rule &L, const Rule &R) {
- return L.Target == R.Target && L.seq() == R.seq();
+ return L.Target == R.Target && L.seq() == R.seq() && L.Guard == R.Guard;
}
};
@@ -186,6 +213,9 @@ struct GrammarTable {
// A table of nonterminals, sorted by name.
// SymbolID is the index of the table.
std::vector<Nonterminal> Nonterminals;
+ // A table of attribute values, sorted by name.
+ // ExtensionID is the index of the table.
+ std::vector<std::string> AttributeValues;
};
} // namespace pseudo
diff --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
index 2be34f8bec68..17634632baf0 100644
--- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -61,6 +61,8 @@ std::string Grammar::dumpRule(RuleID RID) const {
OS << symbolName(R.Target) << " :=";
for (SymbolID SID : R.seq())
OS << " " << symbolName(SID);
+ if (R.Guard)
+ OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
return Result;
}
diff --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
index f581adb3932e..82daf47af610 100644
--- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -47,6 +47,9 @@ class GrammarBuilder {
// Assemble the name->ID and ID->nonterminal name maps.
llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+
+ llvm::DenseSet<llvm::StringRef> UniqueAttributeValues;
+
for (uint16_t I = 0; I < NumTerminals; ++I)
SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
auto Consider = [&](llvm::StringRef Name) {
@@ -55,8 +58,11 @@ class GrammarBuilder {
};
for (const auto &Spec : Specs) {
Consider(Spec.Target);
- for (const RuleSpec::Element &Elt : Spec.Sequence)
+ for (const RuleSpec::Element &Elt : Spec.Sequence) {
Consider(Elt.Symbol);
+ for (const auto& KV : Elt.Attributes)
+ UniqueAttributeValues.insert(KV.second);
+ }
}
llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
T->Nonterminals.emplace_back();
@@ -68,6 +74,15 @@ class GrammarBuilder {
const GrammarTable::Nonterminal &R) {
return L.Name < R.Name;
});
+ // Add an empty string for the corresponding sentinel unset attribute.
+ T->AttributeValues.push_back("");
+ llvm::for_each(UniqueAttributeValues, [&T](llvm::StringRef Name) {
+ T->AttributeValues.emplace_back();
+ T->AttributeValues.back() = Name.str();
+ });
+ llvm::sort(T->AttributeValues);
+ assert(T->AttributeValues.front() == "");
+
// Build name -> ID maps for nonterminals.
for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
@@ -86,7 +101,9 @@ class GrammarBuilder {
for (const RuleSpec::Element &Elt : Spec.Sequence)
Symbols.push_back(Lookup(Elt.Symbol));
T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
+ applyAttributes(Spec, *T, T->Rules.back());
}
+
assert(T->Rules.size() < (1 << RuleBits) &&
"Too many rules to fit in RuleID bits!");
const auto &SymbolOrder = getTopologicalOrder(T.get());
@@ -164,6 +181,9 @@ class GrammarBuilder {
llvm::StringRef Target;
struct Element {
llvm::StringRef Symbol; // Name of the symbol
+ // Attributes that are associated with the sequence symbol or rule.
+ std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
+ Attributes;
};
std::vector<Element> Sequence;
@@ -204,11 +224,46 @@ class GrammarBuilder {
Chunk = Chunk.trim();
if (Chunk.empty())
continue; // skip empty
+ if (Chunk.startswith("[") && Chunk.endswith("]")) {
+ if (Out.Sequence.empty())
+ continue;
+
+ parseAttributes(Chunk, Out.Sequence.back().Attributes);
+ continue;
+ }
Out.Sequence.push_back({Chunk});
}
return true;
- };
+ }
+
+ // Parses one annotation chunk of the form "[key=value]" and appends the
+ // resulting (key, value) pair to Out.
+ //
+ // Content must start with '[' and end with ']' (asserted). The text between
+ // the brackets is split at the first '='; the value is trimmed. When there
+ // is no '=', the whole text becomes the key and the value is empty.
+ // Always returns true.
+ bool parseAttributes(
+     llvm::StringRef Content,
+     std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
+   assert(Content.startswith("[") && Content.endswith("]"));
+   // Strip the enclosing brackets before looking for the separator.
+   const llvm::StringRef Inner = Content.drop_front().drop_back();
+   const std::pair<llvm::StringRef, llvm::StringRef> Parts = Inner.split('=');
+   Out.emplace_back(Parts.first, Parts.second.trim());
+   return true;
+ }
+ // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
+ //
+ // Only the attributes attached to the last element of Spec.Sequence are
+ // considered here. A `guard` attribute sets R.Guard to the ExtensionID of
+ // its value, i.e. the index of that value in T.AttributeValues. Any other
+ // key is reported through Diagnostics as an unknown attribute.
+ void applyAttributes(const RuleSpec& Spec, const GrammarTable& T, Rule& R) {
+   // A rule with an empty body has no element to carry attributes; calling
+   // back() on an empty sequence would be undefined behavior.
+   if (Spec.Sequence.empty())
+     return;
+   // Binary-searches T.AttributeValues for Name and returns its index.
+   auto LookupExtensionID = [&T](llvm::StringRef Name) -> ExtensionID {
+     const auto It = llvm::partition_point(
+         T.AttributeValues, [&](llvm::StringRef X) { return X < Name; });
+     assert(It != T.AttributeValues.end() && *It == Name &&
+            "Didn't find the attribute in AttrValues!");
+     // Make the narrowing from the iterator distance to ExtensionID explicit.
+     return static_cast<ExtensionID>(It - T.AttributeValues.begin());
+   };
+   for (const auto &KV : Spec.Sequence.back().Attributes) {
+     if (KV.first == "guard") {
+       R.Guard = LookupExtensionID(KV.second);
+       continue;
+     }
+     Diagnostics.push_back(
+         llvm::formatv("Unknown attribute '{0}'", KV.first).str());
+   }
+ }
// Inlines all _opt symbols.
// For example, a rule E := id +_opt id, after elimination, we have two
diff --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
index 0f71c4701905..cc72ca2b6f56 100644
--- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -99,6 +99,22 @@ TEST_F(GrammarTest, RuleIDSorted) {
EXPECT_LT(ruleFor("x"), ruleFor("_"));
}
+// Checks that [guard=...] annotations are parsed without diagnostics and that
+// rules with different guard values receive different non-zero ExtensionIDs,
+// while a rule without an annotation keeps Guard == 0.
+TEST_F(GrammarTest, Annotation) {
+ build(R"bnf(
+ _ := x
+
+ x := y [guard=value]
+ y := IDENTIFIER [guard=final]
+
+ )bnf");
+ ASSERT_TRUE(Diags.empty());
+ // Fetches the Guard of the rule that defines the given nonterminal.
+ auto GuardOf = [&](const char *Nonterminal) {
+   return G->lookupRule(ruleFor(Nonterminal)).Guard;
+ };
+ EXPECT_EQ(GuardOf("_"), 0);
+ EXPECT_GT(GuardOf("x"), 0);
+ EXPECT_GT(GuardOf("y"), 0);
+ EXPECT_NE(GuardOf("x"), GuardOf("y"));
+}
+
TEST_F(GrammarTest, Diagnostics) {
build(R"cpp(
_ := ,_opt
@@ -110,6 +126,8 @@ TEST_F(GrammarTest, Diagnostics) {
# cycle
a := b
b := a
+
+ _ := IDENTIFIER [unknown=value]
)cpp");
EXPECT_EQ(G->underscore(), id("_"));
@@ -120,7 +138,8 @@ TEST_F(GrammarTest, Diagnostics) {
"Failed to parse 'invalid': no separator :=",
"Token-like name IDENFIFIE is used as a nonterminal",
"No rules for nonterminal: IDENFIFIE",
- "The grammar contains a cycle involving symbol a"));
+ "The grammar contains a cycle involving symbol a",
+ "Unknown attribute 'unknown'"));
}
TEST_F(GrammarTest, FirstAndFollowSets) {
More information about the cfe-commits
mailing list