[clang-tools-extra] [clang-pseudo] Add a --print-terminal-tokens option (PR #87898)
Jeremy Rifkin via cfe-commits
cfe-commits at lists.llvm.org
Sat Apr 6 22:10:17 PDT 2024
https://github.com/jeremy-rifkin updated https://github.com/llvm/llvm-project/pull/87898
>From 2ebb15e08b5e2d8a9fe6cfddbe0dd2a8942b2542 Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 6 Apr 2024 17:02:20 -0500
Subject: [PATCH 1/3] Add a --print-terminal-tokens option
---
clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp | 2 +-
.../pseudo/include/clang-pseudo/Forest.h | 11 ++++++--
clang-tools-extra/pseudo/lib/Forest.cpp | 26 +++++++++++++------
clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 12 +++++++--
4 files changed, 38 insertions(+), 13 deletions(-)
diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
index 87b9d15480cc35..33b3da1ed6ea9f 100644
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -46,7 +46,7 @@ class Fuzzer {
glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
*Lang.G.findNonterminal("translation-unit"), Lang);
if (Print)
- llvm::outs() << Root.dumpRecursive(Lang.G);
+ llvm::outs() << Root.dumpRecursive(Lang.G, std::nullopt);
}
};
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
index e9edb40e02b64e..642c489b3fba41 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -26,6 +26,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Allocator.h"
#include <cstdint>
+#include <functional>
+#include <optional>
namespace clang {
namespace pseudo {
@@ -112,8 +114,13 @@ class alignas(class ForestNode *) ForestNode {
// Iteration over all nodes in the forest, including this.
llvm::iterator_range<RecursiveIterator> descendants() const;
- std::string dump(const Grammar &) const;
- std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
+ std::string
+ dump(const Grammar &,
+ std::optional<std::reference_wrapper<const TokenStream>>) const;
+ std::string
+ dumpRecursive(const Grammar &,
+ std::optional<std::reference_wrapper<const TokenStream>>,
+ bool Abbreviated = false) const;
private:
friend class ForestArena;
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
index e8e60e5ec475a4..adce731d6c1e1c 100644
--- a/clang-tools-extra/pseudo/lib/Forest.cpp
+++ b/clang-tools-extra/pseudo/lib/Forest.cpp
@@ -45,13 +45,21 @@ ForestNode::descendants() const {
return {RecursiveIterator(this), RecursiveIterator()};
}
-std::string ForestNode::dump(const Grammar &G) const {
+std::string ForestNode::dump(
+ const Grammar &G,
+ std::optional<std::reference_wrapper<const TokenStream>> Code) const {
switch (kind()) {
case Ambiguous:
return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
case Terminal:
- return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
- startTokenIndex());
+ if (Code) {
+ return llvm::formatv("{0} := tok[{1}] ({2})", G.symbolName(symbol()),
+ startTokenIndex(),
+ Code->get().tokens()[startTokenIndex()]);
+ } else {
+ return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
+ startTokenIndex());
+ }
case Sequence:
return G.dumpRule(rule());
case Opaque:
@@ -60,8 +68,10 @@ std::string ForestNode::dump(const Grammar &G) const {
llvm_unreachable("Unhandled node kind!");
}
-std::string ForestNode::dumpRecursive(const Grammar &G,
- bool Abbreviated) const {
+std::string ForestNode::dumpRecursive(
+ const Grammar &G,
+ std::optional<std::reference_wrapper<const TokenStream>> Code,
+ bool Abbreviated) const {
using llvm::formatv;
Token::Index MaxToken = 0;
// Count visits of nodes so we can mark those seen multiple times.
@@ -95,7 +105,7 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
std::string Result;
constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
std::function<void(const ForestNode *, Token::Index, std::optional<SymbolID>,
- LineDecoration &LineDec)>
+ LineDecoration LineDec)>
Dump = [&](const ForestNode *P, Token::Index End,
std::optional<SymbolID> ElidedParent, LineDecoration LineDec) {
bool SharedNode = VisitCounts.find(P)->getSecond() > 1;
@@ -145,13 +155,13 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
// The first time, print as #1. Later, =#1.
if (First) {
- Result += formatv("{0} #{1}", P->dump(G), ID);
+ Result += formatv("{0} #{1}", P->dump(G, Code), ID);
} else {
Result += formatv("{0} =#{1}", G.symbolName(P->symbol()), ID);
Children = {}; // Don't walk the children again.
}
} else {
- Result.append(P->dump(G));
+ Result.append(P->dump(G, Code));
}
Result.push_back('\n');
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 6a64760749cefe..4797dc01cdc13b 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -51,6 +51,9 @@ static opt<bool> Disambiguate("disambiguate",
desc("Choose best tree from parse forest"));
static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
+static opt<bool>
+ PrintTerminalTokens("print-terminal-tokens",
+ desc("Print terminal tokens in parse forest"));
static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
init(true));
static opt<std::string> HTMLForest("html-forest",
@@ -161,9 +164,14 @@ int main(int argc, char *argv[]) {
auto &Root =
glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
*StartSymID, Lang);
+ std::optional<std::reference_wrapper<const TokenStream>> Code;
+ if (PrintTerminalTokens) {
+ Code = *ParseableStream;
+ }
// If we're disambiguating, we'll print at the end instead.
if (PrintForest && !Disambiguate)
- llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
+ llvm::outs() << Root.dumpRecursive(Lang.G, Code,
+ /*Abbreviated=*/ForestAbbrev);
clang::pseudo::Disambiguation Disambig;
if (Disambiguate)
Disambig = clang::pseudo::disambiguate(&Root, {});
@@ -234,7 +242,7 @@ int main(int argc, char *argv[]) {
ForestNode *DisambigRoot = &Root;
removeAmbiguities(DisambigRoot, Disambig);
llvm::outs() << "Disambiguated tree:\n";
- llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
+ llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, Code,
/*Abbreviated=*/ForestAbbrev);
}
}
>From ed5e37ba210ea76c35d20f3d14cc985e987fa8fd Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 6 Apr 2024 19:41:45 -0500
Subject: [PATCH 2/3] Fix a LLVM_DEBUG
---
clang-tools-extra/pseudo/lib/GLR.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index ac43c02db521eb..e4b5be79d7e58d 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -527,7 +527,8 @@ class GLRReduce {
SequenceNodes.size() == 1
? SequenceNodes.front()
: &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
- LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n");
+ LLVM_DEBUG(llvm::dbgs()
+ << " --> " << Parsed->dump(Lang.G, std::nullopt) << "\n");
// Bases for this family, deduplicate them, and group by the goTo State.
sortAndUnique(FamilyBases);
>From ac98abcb934934b94c61bcf68fdfcb3b877e6505 Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sun, 7 Apr 2024 00:10:03 -0500
Subject: [PATCH 3/3] Improvements and fixes
---
clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp | 2 +-
.../pseudo/include/clang-pseudo/Forest.h | 13 +++++++++----
clang-tools-extra/pseudo/lib/GLR.cpp | 3 +--
clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 2 +-
clang-tools-extra/pseudo/unittests/ForestTest.cpp | 6 +++---
5 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
index 33b3da1ed6ea9f..87b9d15480cc35 100644
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -46,7 +46,7 @@ class Fuzzer {
glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
*Lang.G.findNonterminal("translation-unit"), Lang);
if (Print)
- llvm::outs() << Root.dumpRecursive(Lang.G, std::nullopt);
+ llvm::outs() << Root.dumpRecursive(Lang.G);
}
};
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
index 642c489b3fba41..0735e1fae08014 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -114,12 +114,17 @@ class alignas(class ForestNode *) ForestNode {
// Iteration over all nodes in the forest, including this.
llvm::iterator_range<RecursiveIterator> descendants() const;
- std::string
- dump(const Grammar &,
- std::optional<std::reference_wrapper<const TokenStream>>) const;
+ // Dump forest node to text. If Code is std::nullopt, terminals will be
+ // displayed as token indexes.
+ std::string dump(const Grammar &,
+ std::optional<std::reference_wrapper<const TokenStream>>
+ Code = std::nullopt) const;
+ // Dump forest node recursively to text. If Code is std::nullopt, terminals
+ // will be displayed as token indexes.
std::string
dumpRecursive(const Grammar &,
- std::optional<std::reference_wrapper<const TokenStream>>,
+ std::optional<std::reference_wrapper<const TokenStream>> Code =
+ std::nullopt,
bool Abbreviated = false) const;
private:
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index e4b5be79d7e58d..ac43c02db521eb 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -527,8 +527,7 @@ class GLRReduce {
SequenceNodes.size() == 1
? SequenceNodes.front()
: &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
- LLVM_DEBUG(llvm::dbgs()
- << " --> " << Parsed->dump(Lang.G, std::nullopt) << "\n");
+ LLVM_DEBUG(llvm::dbgs() << " --> " << Parsed->dump(Lang.G) << "\n");
// Bases for this family, deduplicate them, and group by the goTo State.
sortAndUnique(FamilyBases);
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 4797dc01cdc13b..ec5e6bdf7bb287 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -53,7 +53,7 @@ static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser stat
static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
static opt<bool>
PrintTerminalTokens("print-terminal-tokens",
- desc("Print terminal tokens in parse forest"));
+ desc("Print terminal tokens in parse forest"));
static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
init(true));
static opt<std::string> HTMLForest("html-forest",
diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
index 36af896148209d..d959b69ecdc943 100644
--- a/clang-tools-extra/pseudo/unittests/ForestTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
@@ -73,12 +73,12 @@ TEST_F(ForestTest, DumpBasic) {
const auto *Add =
&Arena.createSequence(symbol("add-expression"), ruleFor("add-expression"),
{Left, &T[1], Right});
- EXPECT_EQ(Add->dumpRecursive(G, true),
+ EXPECT_EQ(Add->dumpRecursive(G, std::nullopt, true),
"[ 0, end) add-expression := id-expression + id-expression\n"
"[ 0, 1) ├─id-expression~IDENTIFIER := tok[0]\n"
"[ 1, 2) ├─+ := tok[1]\n"
"[ 2, end) └─id-expression~IDENTIFIER := tok[2]\n");
- EXPECT_EQ(Add->dumpRecursive(G, false),
+ EXPECT_EQ(Add->dumpRecursive(G, std::nullopt, false),
"[ 0, end) add-expression := id-expression + id-expression\n"
"[ 0, 1) ├─id-expression := IDENTIFIER\n"
"[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
@@ -144,7 +144,7 @@ TEST_F(ForestTest, DumpAbbreviatedShared) {
// We must not abbreviate away shared nodes: if we show A~* there's no way to
// show that the intermediate B node is shared between A1 and A2.
- EXPECT_EQ(A->dumpRecursive(G, /*Abbreviate=*/true),
+ EXPECT_EQ(A->dumpRecursive(G, std::nullopt, /*Abbreviate=*/true),
"[ 0, end) A := <ambiguous>\n"
"[ 0, end) ├─A~B := * #1\n"
"[ 0, end) │ └─* := tok[0]\n"
More information about the cfe-commits mailing list