[clang-tools-extra] [clang-pseudo] Add a --print-terminal-tokens option (PR #87898)

Jeremy Rifkin via cfe-commits cfe-commits at lists.llvm.org
Sat Apr 6 15:25:59 PDT 2024


https://github.com/jeremy-rifkin created https://github.com/llvm/llvm-project/pull/87898

This PR adds a `--print-terminal-tokens` option to clang-pseudo. With it, each terminal node in the printed parse forest shows the token itself in addition to its token index:

```
› bin/clang-pseudo --source test.cpp --print-forest
[  0, end) translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
[  0,   1) ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
[  0,   1) │ ├─simple-type-specifier~IDENTIFIER := tok[0]
[  0,   1) │ └─simple-type-specifier~IDENTIFIER := tok[0]
[  1,   3) ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator
[  1,   2) │ ├─ptr-operator~* := tok[1]
[  2,   3) │ └─ptr-declarator~IDENTIFIER := tok[2]
[  3, end) └─; := tok[3]
```
```
› bin/clang-pseudo --source test.cpp --print-forest --print-terminal-tokens
[  0, end) translation-unit~simple-declaration := decl-specifier-seq init-declarator-list ;
[  0,   1) ├─decl-specifier-seq~simple-type-specifier := <ambiguous>
[  0,   1) │ ├─simple-type-specifier~IDENTIFIER := tok[0] (identifier 1:0 "T" flags=1)
[  0,   1) │ └─simple-type-specifier~IDENTIFIER := tok[0] (identifier 1:0 "T" flags=1)
[  1,   3) ├─init-declarator-list~ptr-declarator := ptr-operator ptr-declarator
[  1,   2) │ ├─ptr-operator~* := tok[1] (star 1:0 "*")
[  2,   3) │ └─ptr-declarator~IDENTIFIER := tok[2] (identifier 1:0 "y")
[  3, end) └─; := tok[3] (semi 1:0 ";")
```



From 2ebb15e08b5e2d8a9fe6cfddbe0dd2a8942b2542 Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 6 Apr 2024 17:02:20 -0500
Subject: [PATCH] Add a --print-terminal-tokens option

---
 clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp    |  2 +-
 .../pseudo/include/clang-pseudo/Forest.h      | 11 ++++++--
 clang-tools-extra/pseudo/lib/Forest.cpp       | 26 +++++++++++++------
 clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 12 +++++++--
 4 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
index 87b9d15480cc35..33b3da1ed6ea9f 100644
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -46,7 +46,7 @@ class Fuzzer {
         glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
                  *Lang.G.findNonterminal("translation-unit"), Lang);
     if (Print)
-      llvm::outs() << Root.dumpRecursive(Lang.G);
+      llvm::outs() << Root.dumpRecursive(Lang.G, std::nullopt);
   }
 };
 
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
index e9edb40e02b64e..642c489b3fba41 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -26,6 +26,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Allocator.h"
 #include <cstdint>
+#include <functional>
+#include <optional>
 
 namespace clang {
 namespace pseudo {
@@ -112,8 +114,13 @@ class alignas(class ForestNode *) ForestNode {
   // Iteration over all nodes in the forest, including this.
   llvm::iterator_range<RecursiveIterator> descendants() const;
 
-  std::string dump(const Grammar &) const;
-  std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
+  std::string
+  dump(const Grammar &,
+       std::optional<std::reference_wrapper<const TokenStream>>) const;
+  std::string
+  dumpRecursive(const Grammar &,
+                std::optional<std::reference_wrapper<const TokenStream>>,
+                bool Abbreviated = false) const;
 
 private:
   friend class ForestArena;
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
index e8e60e5ec475a4..adce731d6c1e1c 100644
--- a/clang-tools-extra/pseudo/lib/Forest.cpp
+++ b/clang-tools-extra/pseudo/lib/Forest.cpp
@@ -45,13 +45,21 @@ ForestNode::descendants() const {
   return {RecursiveIterator(this), RecursiveIterator()};
 }
 
-std::string ForestNode::dump(const Grammar &G) const {
+std::string ForestNode::dump(
+    const Grammar &G,
+    std::optional<std::reference_wrapper<const TokenStream>> Code) const {
   switch (kind()) {
   case Ambiguous:
     return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
   case Terminal:
-    return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
-                         startTokenIndex());
+    if (Code) {
+      return llvm::formatv("{0} := tok[{1}] ({2})", G.symbolName(symbol()),
+                           startTokenIndex(),
+                           Code->get().tokens()[startTokenIndex()]);
+    } else {
+      return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
+                           startTokenIndex());
+    }
   case Sequence:
     return G.dumpRule(rule());
   case Opaque:
@@ -60,8 +68,10 @@ std::string ForestNode::dump(const Grammar &G) const {
   llvm_unreachable("Unhandled node kind!");
 }
 
-std::string ForestNode::dumpRecursive(const Grammar &G,
-                                      bool Abbreviated) const {
+std::string ForestNode::dumpRecursive(
+    const Grammar &G,
+    std::optional<std::reference_wrapper<const TokenStream>> Code,
+    bool Abbreviated) const {
   using llvm::formatv;
   Token::Index MaxToken = 0;
   // Count visits of nodes so we can mark those seen multiple times.
@@ -95,7 +105,7 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
   std::string Result;
   constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
   std::function<void(const ForestNode *, Token::Index, std::optional<SymbolID>,
-                     LineDecoration &LineDec)>
+                     LineDecoration LineDec)>
       Dump = [&](const ForestNode *P, Token::Index End,
                  std::optional<SymbolID> ElidedParent, LineDecoration LineDec) {
         bool SharedNode = VisitCounts.find(P)->getSecond() > 1;
@@ -145,13 +155,13 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
 
           // The first time, print as #1. Later, =#1.
           if (First) {
-            Result += formatv("{0} #{1}", P->dump(G), ID);
+            Result += formatv("{0} #{1}", P->dump(G, Code), ID);
           } else {
             Result += formatv("{0} =#{1}", G.symbolName(P->symbol()), ID);
             Children = {}; // Don't walk the children again.
           }
         } else {
-          Result.append(P->dump(G));
+          Result.append(P->dump(G, Code));
         }
         Result.push_back('\n');
 
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 6a64760749cefe..4797dc01cdc13b 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -51,6 +51,9 @@ static opt<bool> Disambiguate("disambiguate",
                               desc("Choose best tree from parse forest"));
 static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
 static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
+static opt<bool>
+    PrintTerminalTokens("print-terminal-tokens",
+                           desc("Print terminal tokens in parse forest"));
 static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
                               init(true));
 static opt<std::string> HTMLForest("html-forest",
@@ -161,9 +164,14 @@ int main(int argc, char *argv[]) {
     auto &Root =
         glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
                  *StartSymID, Lang);
+    std::optional<std::reference_wrapper<const TokenStream>> Code;
+    if (PrintTerminalTokens) {
+      Code = *ParseableStream;
+    }
     // If we're disambiguating, we'll print at the end instead.
     if (PrintForest && !Disambiguate)
-      llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
+      llvm::outs() << Root.dumpRecursive(Lang.G, Code,
+                                         /*Abbreviated=*/ForestAbbrev);
     clang::pseudo::Disambiguation Disambig;
     if (Disambiguate)
       Disambig = clang::pseudo::disambiguate(&Root, {});
@@ -234,7 +242,7 @@ int main(int argc, char *argv[]) {
       ForestNode *DisambigRoot = &Root;
       removeAmbiguities(DisambigRoot, Disambig);
       llvm::outs() << "Disambiguated tree:\n";
-      llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
+      llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, Code,
                                                   /*Abbreviated=*/ForestAbbrev);
     }
   }



More information about the cfe-commits mailing list