[clang-tools-extra] [clang-pseudo] Add a --print-terminal-tokens option (PR #87898)
Jeremy Rifkin via cfe-commits
cfe-commits at lists.llvm.org
Sat Apr 6 15:25:59 PDT 2024
https://github.com/jeremy-rifkin created https://github.com/llvm/llvm-project/pull/87898
This PR adds a `--print-terminal-tokens` option to clang-pseudo. When the parse forest is printed with this option, each terminal node shows the token itself (kind, position, text, and flags) in addition to its token index:
```
› bin/clang-pseudo --source test.cpp --print-forest
[ 0, end) translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
[ 0, 1) ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
[ 0, 1) │ ├─simple-type-specifier~IDENTIFIER := tok[0]
[ 0, 1) │ └─simple-type-specifier~IDENTIFIER := tok[0]
[ 1, 3) ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator
[ 1, 2) │ ├─ptr-operator~* := tok[1]
[ 2, 3) │ └─ptr-declarator~IDENTIFIER := tok[2]
[ 3, end) └─; := tok[3]
```
```
› bin/clang-pseudo --source test.cpp --print-forest --print-terminal-tokens
[ 0, end) translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
[ 0, 1) ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
[ 0, 1) │ ├─simple-type-specifier~IDENTIFIER := tok[0] (identifier 1:0 "T" flags=1)
[ 0, 1) │ └─simple-type-specifier~IDENTIFIER := tok[0] (identifier 1:0 "T" flags=1)
[ 1, 3) ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator
[ 1, 2) │ ├─ptr-operator~* := tok[1] (star 1:0 "*")
[ 2, 3) │ └─ptr-declarator~IDENTIFIER := tok[2] (identifier 1:0 "y")
[ 3, end) └─; := tok[3] (semi 1:0 ";")
```
>From 2ebb15e08b5e2d8a9fe6cfddbe0dd2a8942b2542 Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 6 Apr 2024 17:02:20 -0500
Subject: [PATCH] Add a --print-terminal-tokens option
---
clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp | 2 +-
.../pseudo/include/clang-pseudo/Forest.h | 11 ++++++--
clang-tools-extra/pseudo/lib/Forest.cpp | 26 +++++++++++++------
clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 12 +++++++--
4 files changed, 38 insertions(+), 13 deletions(-)
diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
index 87b9d15480cc35..33b3da1ed6ea9f 100644
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -46,7 +46,7 @@ class Fuzzer {
glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
*Lang.G.findNonterminal("translation-unit"), Lang);
if (Print)
- llvm::outs() << Root.dumpRecursive(Lang.G);
+ llvm::outs() << Root.dumpRecursive(Lang.G, std::nullopt);
}
};
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
index e9edb40e02b64e..642c489b3fba41 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -26,6 +26,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Allocator.h"
#include <cstdint>
+#include <functional>
+#include <optional>
namespace clang {
namespace pseudo {
@@ -112,8 +114,13 @@ class alignas(class ForestNode *) ForestNode {
// Iteration over all nodes in the forest, including this.
llvm::iterator_range<RecursiveIterator> descendants() const;
- std::string dump(const Grammar &) const;
- std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
+ std::string
+ dump(const Grammar &,
+ std::optional<std::reference_wrapper<const TokenStream>>) const;
+ std::string
+ dumpRecursive(const Grammar &,
+ std::optional<std::reference_wrapper<const TokenStream>>,
+ bool Abbreviated = false) const;
private:
friend class ForestArena;
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
index e8e60e5ec475a4..adce731d6c1e1c 100644
--- a/clang-tools-extra/pseudo/lib/Forest.cpp
+++ b/clang-tools-extra/pseudo/lib/Forest.cpp
@@ -45,13 +45,21 @@ ForestNode::descendants() const {
return {RecursiveIterator(this), RecursiveIterator()};
}
-std::string ForestNode::dump(const Grammar &G) const {
+std::string ForestNode::dump(
+ const Grammar &G,
+ std::optional<std::reference_wrapper<const TokenStream>> Code) const {
switch (kind()) {
case Ambiguous:
return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
case Terminal:
- return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
- startTokenIndex());
+ if (Code) {
+ return llvm::formatv("{0} := tok[{1}] ({2})", G.symbolName(symbol()),
+ startTokenIndex(),
+ Code->get().tokens()[startTokenIndex()]);
+ } else {
+ return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
+ startTokenIndex());
+ }
case Sequence:
return G.dumpRule(rule());
case Opaque:
@@ -60,8 +68,10 @@ std::string ForestNode::dump(const Grammar &G) const {
llvm_unreachable("Unhandled node kind!");
}
-std::string ForestNode::dumpRecursive(const Grammar &G,
- bool Abbreviated) const {
+std::string ForestNode::dumpRecursive(
+ const Grammar &G,
+ std::optional<std::reference_wrapper<const TokenStream>> Code,
+ bool Abbreviated) const {
using llvm::formatv;
Token::Index MaxToken = 0;
// Count visits of nodes so we can mark those seen multiple times.
@@ -95,7 +105,7 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
std::string Result;
constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
std::function<void(const ForestNode *, Token::Index, std::optional<SymbolID>,
- LineDecoration &LineDec)>
+ LineDecoration LineDec)>
Dump = [&](const ForestNode *P, Token::Index End,
std::optional<SymbolID> ElidedParent, LineDecoration LineDec) {
bool SharedNode = VisitCounts.find(P)->getSecond() > 1;
@@ -145,13 +155,13 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
// The first time, print as #1. Later, =#1.
if (First) {
- Result += formatv("{0} #{1}", P->dump(G), ID);
+ Result += formatv("{0} #{1}", P->dump(G, Code), ID);
} else {
Result += formatv("{0} =#{1}", G.symbolName(P->symbol()), ID);
Children = {}; // Don't walk the children again.
}
} else {
- Result.append(P->dump(G));
+ Result.append(P->dump(G, Code));
}
Result.push_back('\n');
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 6a64760749cefe..4797dc01cdc13b 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -51,6 +51,9 @@ static opt<bool> Disambiguate("disambiguate",
desc("Choose best tree from parse forest"));
static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
+static opt<bool>
+ PrintTerminalTokens("print-terminal-tokens",
+ desc("Print terminal tokens in parse forest"));
static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
init(true));
static opt<std::string> HTMLForest("html-forest",
@@ -161,9 +164,14 @@ int main(int argc, char *argv[]) {
auto &Root =
glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
*StartSymID, Lang);
+ std::optional<std::reference_wrapper<const TokenStream>> Code;
+ if (PrintTerminalTokens) {
+ Code = *ParseableStream;
+ }
// If we're disambiguating, we'll print at the end instead.
if (PrintForest && !Disambiguate)
- llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
+ llvm::outs() << Root.dumpRecursive(Lang.G, Code,
+ /*Abbreviated=*/ForestAbbrev);
clang::pseudo::Disambiguation Disambig;
if (Disambiguate)
Disambig = clang::pseudo::disambiguate(&Root, {});
@@ -234,7 +242,7 @@ int main(int argc, char *argv[]) {
ForestNode *DisambigRoot = &Root;
removeAmbiguities(DisambigRoot, Disambig);
llvm::outs() << "Disambiguated tree:\n";
- llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
+ llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, Code,
/*Abbreviated=*/ForestAbbrev);
}
}
More information about the cfe-commits
mailing list