[clang-tools-extra] 62d5f25 - [pseudo] Introduce parse forest.
Haojian Wu via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 24 06:49:32 PDT 2022
Author: Haojian Wu
Date: 2022-03-24T14:47:17+01:00
New Revision: 62d5f254ccd0fc745fceb15a80494ef892db367c
URL: https://github.com/llvm/llvm-project/commit/62d5f254ccd0fc745fceb15a80494ef892db367c
DIFF: https://github.com/llvm/llvm-project/commit/62d5f254ccd0fc745fceb15a80494ef892db367c.diff
LOG: [pseudo] Introduce parse forest.
Parse forest is the output of the GLR parser, it is a tree-like DAG
which presents all possible parse trees without duplicating subparse structures.
This is a patch split from https://reviews.llvm.org/D121150.
Differential Revision: https://reviews.llvm.org/D122139
Added:
clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
clang-tools-extra/pseudo/lib/Forest.cpp
clang-tools-extra/pseudo/unittests/ForestTest.cpp
Modified:
clang-tools-extra/pseudo/lib/CMakeLists.txt
clang-tools-extra/pseudo/unittests/CMakeLists.txt
Removed:
################################################################################
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
new file mode 100644
index 0000000000000..650f9131bd2ea
--- /dev/null
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -0,0 +1,178 @@
+//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A parse forest represents a set of possible parse trees efficiently, it is
+// produced by the GLR parser.
+//
+// Despite the name, its data structure is a tree-like DAG with a single root.
+// Multiple ways to parse the same tokens are presented as an ambiguous node
+// with all possible interpretations as children.
+// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
+// "expr := expr + expr", they will share a Sequence node representing the expr.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Allocator.h"
+#include <cstdint>
+
+namespace clang {
+namespace pseudo {
+
+// A node represents ways to parse a sequence of tokens, it interprets a fixed
+// range of tokens as a fixed grammar symbol.
+//
+// There are
diff erent kinds of nodes, some nodes have "children" (stored in a
+// trailing array) and have pointers to them. "Children" has
diff erent semantics
+// depending on the node kinds. For an Ambiguous node, it means all
+// possible interpretations; for a Sequence node, it means each symbol on the
+// right hand side of the production rule.
+//
+// Since this is a node in a DAG, a node may have multiple parents. And a node
+// doesn't have parent pointers.
+class alignas(class ForestNode *) ForestNode {
+public:
+ enum Kind : uint8_t {
+ // A Terminal node is a single terminal symbol bound to a token.
+ Terminal,
+ // A Sequence node is a nonterminal symbol parsed from a grammar rule,
+ // elements() are the parses of each symbol on the RHS of the rule.
+ // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
+ // are [X, Y, Z].
+ Sequence,
+ // An Ambiguous node exposes multiple ways to interpret the code as the
+ // same symbol, alternatives() are all possible parses.
+ Ambiguous,
+ // An Opaque node is a placeholder. It asserts that tokens match a symbol,
+ // without saying how.
+ // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
+ // code).
+ Opaque,
+ };
+ Kind kind() const { return K; }
+
+ SymbolID symbol() const { return Symbol; }
+
+ // The start of the token range, it is a poistion within a token stream.
+ Token::Index startTokenIndex() const { return StartIndex; }
+
+ // Returns the corresponding grammar rule.
+ // REQUIRES: this is a Sequence node.
+ RuleID rule() const {
+ assert(kind() == Sequence);
+ return Data & ((1 << RuleBits) - 1);
+ }
+ // Returns the parses of each element on the RHS of the rule.
+ // REQUIRES: this is a Sequence node;
+ llvm::ArrayRef<const ForestNode *> elements() const {
+ assert(kind() == Sequence);
+ return children(Data >> RuleBits);
+ };
+
+ // Returns all possible interpretations of the code.
+ // REQUIRES: this is an Ambiguous node.
+ llvm::ArrayRef<const ForestNode *> alternatives() const {
+ assert(kind() == Ambiguous);
+ return children(Data);
+ }
+
+ std::string dump(const Grammar &) const;
+ std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
+
+private:
+ friend class ForestArena;
+
+ ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
+ : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}
+
+ ForestNode(const ForestNode &) = delete;
+ ForestNode &operator=(const ForestNode &) = delete;
+ ForestNode(ForestNode &&) = delete;
+ ForestNode &operator=(ForestNode &&) = delete;
+
+ static uint16_t sequenceData(RuleID Rule,
+ llvm::ArrayRef<const ForestNode *> Elements) {
+ assert(Rule < (1 << RuleBits));
+ assert(Elements.size() < (1 << (16 - RuleBits)));
+ return Rule | Elements.size() << RuleBits;
+ }
+ static uint16_t
+ ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
+ return Alternatives.size();
+ }
+
+ // Retrieves the trailing array.
+ llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
+ return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
+ Num);
+ }
+
+ Token::Index StartIndex;
+ Kind K : 4;
+ SymbolID Symbol : SymbolBits;
+ // Sequence - child count : 4 | RuleID : RuleBits (12)
+ // Ambiguous - child count : 16
+ // Terminal, Opaque - unused
+ uint16_t Data;
+ // An array of ForestNode* following the object.
+};
+// ForestNode may not be destroyed (for BumpPtrAllocator).
+static_assert(std::is_trivially_destructible<ForestNode>(), "");
+
+// A memory arena for the parse forest.
+class ForestArena {
+public:
+ llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
+ ForestNode &createSequence(SymbolID SID, RuleID RID,
+ llvm::ArrayRef<const ForestNode *> Elements) {
+ assert(!Elements.empty());
+ return create(ForestNode::Sequence, SID,
+ Elements.front()->startTokenIndex(),
+ ForestNode::sequenceData(RID, Elements), Elements);
+ }
+ ForestNode &createAmbiguous(SymbolID SID,
+ llvm::ArrayRef<const ForestNode *> Alternatives) {
+ assert(!Alternatives.empty());
+ assert(llvm::all_of(Alternatives,
+ [SID](const ForestNode *Alternative) {
+ return SID == Alternative->symbol();
+ }) &&
+ "Ambiguous alternatives must represent the same symbol!");
+ return create(ForestNode::Ambiguous, SID,
+ Alternatives.front()->startTokenIndex(),
+ ForestNode::ambiguousData(Alternatives), Alternatives);
+ }
+ ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
+ return create(ForestNode::Opaque, SID, Start, 0, {});
+ }
+
+ size_t nodeCount() const { return NodeCount; }
+ size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }
+
+private:
+ ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
+ uint16_t Data,
+ llvm::ArrayRef<const ForestNode *> Elements) {
+ ++NodeCount;
+ ForestNode *New = new (Arena.Allocate(
+ sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
+ alignof(ForestNode))) ForestNode(K, SID, Start, Data);
+ if (!Elements.empty())
+ llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
+ return *New;
+ }
+
+ llvm::BumpPtrAllocator Arena;
+ uint32_t NodeCount = 0;
+};
+
+} // namespace pseudo
+} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
index 7239a58bc85f2..e23093845660d 100644
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangPseudo
DirectiveMap.cpp
+ Forest.cpp
Grammar.cpp
GrammarBNF.cpp
Lex.cpp
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
new file mode 100644
index 0000000000000..41b9be8a8d30f
--- /dev/null
+++ b/clang-tools-extra/pseudo/lib/Forest.cpp
@@ -0,0 +1,126 @@
+//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace pseudo {
+
+std::string ForestNode::dump(const Grammar &G) const {
+ switch (kind()) {
+ case Ambiguous:
+ return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
+ case Terminal:
+ return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
+ startTokenIndex());
+ case Sequence:
+ return G.dumpRule(rule());
+ case Opaque:
+ return llvm::formatv("{0} := <opaque>", G.symbolName(symbol()));
+ }
+ llvm_unreachable("Unhandled node kind!");
+}
+
+std::string ForestNode::dumpRecursive(const Grammar &G,
+ bool Abbreviated) const {
+ // Count visits of nodes so we can mark those seen multiple times.
+ llvm::DenseMap<const ForestNode *, /*VisitCount*/ unsigned> VisitCounts;
+ std::function<void(const ForestNode *)> CountVisits =
+ [&](const ForestNode *P) {
+ if (VisitCounts[P]++ > 0)
+ return; // Don't count children as multiply visited.
+ if (P->kind() == Ambiguous)
+ llvm::for_each(P->alternatives(), CountVisits);
+ else if (P->kind() == Sequence)
+ llvm::for_each(P->elements(), CountVisits);
+ };
+ CountVisits(this);
+
+ // We print a "#<id>" for nonterminal forest nodes that are being dumped
+ // multiple times.
+ llvm::DenseMap<const ForestNode *, size_t> ReferenceIds;
+ std::string Result;
+ constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
+ std::function<void(const ForestNode *, unsigned, Token::Index,
+ llvm::Optional<SymbolID>)>
+ Dump = [&](const ForestNode *P, unsigned Level, Token::Index End,
+ llvm::Optional<SymbolID> ElidedParent) {
+ llvm::ArrayRef<const ForestNode *> Children;
+ auto EndOfElement = [&](size_t ChildIndex) {
+ return ChildIndex + 1 == Children.size()
+ ? End
+ : Children[ChildIndex + 1]->startTokenIndex();
+ };
+ if (P->kind() == Ambiguous) {
+ Children = P->alternatives();
+ } else if (P->kind() == Sequence) {
+ Children = P->elements();
+ if (Abbreviated) {
+ if (P->startTokenIndex() == End)
+ return;
+ for (size_t I = 0; I < Children.size(); ++I)
+ if (Children[I]->startTokenIndex() == P->startTokenIndex() &&
+ EndOfElement(I) == End) {
+ return Dump(
+ Children[I], Level, End,
+ /*ElidedParent=*/ElidedParent.getValueOr(P->symbol()));
+ }
+ }
+ }
+
+ // FIXME: pretty ascii trees
+ if (End == KEnd)
+ Result += llvm::formatv("[{0,3}, end) ", P->startTokenIndex());
+ else
+ Result += llvm::formatv("[{0,3}, {1,3}) ", P->startTokenIndex(), End);
+ Result.append(2 * Level, ' ');
+ if (ElidedParent.hasValue()) {
+ Result += G.symbolName(*ElidedParent);
+ Result += "~";
+ }
+ Result.append(P->dump(G));
+
+ if (VisitCounts.find(P)->getSecond() > 1 &&
+ P->kind() != ForestNode::Terminal) {
+ // The first time, print as #1. Later, =#1.
+ auto It = ReferenceIds.try_emplace(P, ReferenceIds.size() + 1);
+ Result +=
+ llvm::formatv(" {0}#{1}", It.second ? "" : "=", It.first->second);
+ }
+ Result.push_back('\n');
+
+ ++Level;
+ for (size_t I = 0; I < Children.size(); ++I)
+ Dump(Children[I], Level,
+ P->kind() == Sequence ? EndOfElement(I) : End, llvm::None);
+ };
+ Dump(this, 0, KEnd, llvm::None);
+ return Result;
+}
+
+llvm::ArrayRef<ForestNode>
+ForestArena::createTerminals(const TokenStream &Code) {
+ ForestNode *Terminals = Arena.Allocate<ForestNode>(Code.tokens().size());
+ size_t Index = 0;
+ for (const auto &T : Code.tokens()) {
+ new (&Terminals[Index])
+ ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
+ /*Start=*/Index, /*TerminalData*/ 0);
+ ++Index;
+ }
+ NodeCount = Index;
+ return llvm::makeArrayRef(Terminals, Index);
+}
+
+} // namespace pseudo
+} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
index e4919f8a12212..7b06e5bead519 100644
--- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
add_custom_target(ClangPseudoUnitTests)
add_unittest(ClangPseudoUnitTests ClangPseudoTests
DirectiveMapTest.cpp
+ ForestTest.cpp
GrammarTest.cpp
LRTableTest.cpp
TokenTest.cpp
diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
new file mode 100644
index 0000000000000..b9bd08d78288a
--- /dev/null
+++ b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
@@ -0,0 +1,130 @@
+//===--- ForestTest.cpp - Test Forest dump ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "llvm/ADT/StringRef.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+// FIXME: extract to a TestGrammar class to allow code sharing among tests.
+class ForestTest : public ::testing::Test {
+public:
+ void build(llvm::StringRef BNF) {
+ Diags.clear();
+ G = Grammar::parseBNF(BNF, Diags);
+ }
+
+ SymbolID symbol(llvm::StringRef Name) const {
+ for (unsigned I = 0; I < NumTerminals; ++I)
+ if (G->table().Terminals[I] == Name)
+ return tokenSymbol(static_cast<tok::TokenKind>(I));
+ for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID)
+ if (G->table().Nonterminals[ID].Name == Name)
+ return ID;
+ ADD_FAILURE() << "No such symbol found: " << Name;
+ return 0;
+ }
+
+ RuleID ruleFor(llvm::StringRef NonterminalName) const {
+ auto RuleRange = G->table().Nonterminals[symbol(NonterminalName)].RuleRange;
+ if (RuleRange.End - RuleRange.Start == 1)
+ return G->table().Nonterminals[symbol(NonterminalName)].RuleRange.Start;
+ ADD_FAILURE() << "Expected a single rule for " << NonterminalName
+ << ", but it has " << RuleRange.End - RuleRange.Start
+ << " rule!\n";
+ return 0;
+ }
+
+protected:
+ std::unique_ptr<Grammar> G;
+ std::vector<std::string> Diags;
+};
+
+TEST_F(ForestTest, DumpBasic) {
+ build(R"cpp(
+ _ := add-expression
+ add-expression := id-expression + id-expression
+ id-expression := IDENTIFIER
+ )cpp");
+ ASSERT_TRUE(Diags.empty());
+ ForestArena Arena;
+ const auto &TS =
+ cook(lex("a + b", clang::LangOptions()), clang::LangOptions());
+
+ auto T = Arena.createTerminals(TS);
+ ASSERT_EQ(T.size(), 3u);
+ const auto *Left = &Arena.createSequence(
+ symbol("id-expression"), ruleFor("id-expression"), {&T.front()});
+ const auto *Right = &Arena.createSequence(symbol("id-expression"),
+ ruleFor("id-expression"), {&T[2]});
+
+ const auto *Add =
+ &Arena.createSequence(symbol("add-expression"), ruleFor("add-expression"), {Left, &T[1], Right});
+ EXPECT_EQ(Add->dumpRecursive(*G, true),
+ "[ 0, end) add-expression := id-expression + id-expression\n"
+ "[ 0, 1) id-expression~IDENTIFIER := tok[0]\n"
+ "[ 1, 2) + := tok[1]\n"
+ "[ 2, end) id-expression~IDENTIFIER := tok[2]\n");
+ EXPECT_EQ(Add->dumpRecursive(*G, false),
+ "[ 0, end) add-expression := id-expression + id-expression\n"
+ "[ 0, 1) id-expression := IDENTIFIER\n"
+ "[ 0, 1) IDENTIFIER := tok[0]\n"
+ "[ 1, 2) + := tok[1]\n"
+ "[ 2, end) id-expression := IDENTIFIER\n"
+ "[ 2, end) IDENTIFIER := tok[2]\n");
+}
+
+TEST_F(ForestTest, DumpAmbiguousAndRefs) {
+ build(R"cpp(
+ _ := type
+ type := class-type # rule 1
+ type := enum-type # rule 2
+ class-type := shared-type
+ enum-type := shared-type
+ shared-type := IDENTIFIER)cpp");
+ ASSERT_TRUE(Diags.empty());
+ ForestArena Arena;
+ const auto &TS = cook(lex("abc", clang::LangOptions()), clang::LangOptions());
+
+ auto Terminals = Arena.createTerminals(TS);
+ ASSERT_EQ(Terminals.size(), 1u);
+
+ const auto *SharedType = &Arena.createSequence(
+ symbol("shared-type"), ruleFor("shared-type"), {Terminals.begin()});
+ const auto *ClassType = &Arena.createSequence(
+ symbol("class-type"), ruleFor("class-type"), {SharedType});
+ const auto *Enumtype = &Arena.createSequence(
+ symbol("enum-type"), ruleFor("enum-type"), {SharedType});
+ const auto *Alternative1 =
+ &Arena.createSequence(symbol("type"), /*RuleID=*/1, {ClassType});
+ const auto *Alternative2 =
+ &Arena.createSequence(symbol("type"), /*RuleID=*/2, {Enumtype});
+ const auto *Type =
+ &Arena.createAmbiguous(symbol("type"), {Alternative1, Alternative2});
+ EXPECT_EQ(Type->dumpRecursive(*G),
+ "[ 0, end) type := <ambiguous>\n"
+ "[ 0, end) class-type := shared-type\n"
+ "[ 0, end) class-type := shared-type\n"
+ "[ 0, end) shared-type := IDENTIFIER #1\n"
+ "[ 0, end) IDENTIFIER := tok[0]\n"
+ "[ 0, end) enum-type := shared-type\n"
+ "[ 0, end) enum-type := shared-type\n"
+ "[ 0, end) shared-type := IDENTIFIER =#1\n"
+ "[ 0, end) IDENTIFIER := tok[0]\n");
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang
More information about the cfe-commits
mailing list