[clang-tools-extra] [clang-pseudo] Add a --print-terminal-tokens option (PR #87898)

Jeremy Rifkin via cfe-commits cfe-commits at lists.llvm.org
Sat Apr 6 22:10:17 PDT 2024


https://github.com/jeremy-rifkin updated https://github.com/llvm/llvm-project/pull/87898

>From 2ebb15e08b5e2d8a9fe6cfddbe0dd2a8942b2542 Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 6 Apr 2024 17:02:20 -0500
Subject: [PATCH 1/3] Add a --print-terminal-tokens option

---
 clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp    |  2 +-
 .../pseudo/include/clang-pseudo/Forest.h      | 11 ++++++--
 clang-tools-extra/pseudo/lib/Forest.cpp       | 26 +++++++++++++------
 clang-tools-extra/pseudo/tool/ClangPseudo.cpp | 12 +++++++--
 4 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
index 87b9d15480cc35..33b3da1ed6ea9f 100644
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -46,7 +46,7 @@ class Fuzzer {
         glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
                  *Lang.G.findNonterminal("translation-unit"), Lang);
     if (Print)
-      llvm::outs() << Root.dumpRecursive(Lang.G);
+      llvm::outs() << Root.dumpRecursive(Lang.G, std::nullopt);
   }
 };
 
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
index e9edb40e02b64e..642c489b3fba41 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -26,6 +26,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Allocator.h"
 #include <cstdint>
+#include <functional>
+#include <optional>
 
 namespace clang {
 namespace pseudo {
@@ -112,8 +114,13 @@ class alignas(class ForestNode *) ForestNode {
   // Iteration over all nodes in the forest, including this.
   llvm::iterator_range<RecursiveIterator> descendants() const;
 
-  std::string dump(const Grammar &) const;
-  std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
+  std::string
+  dump(const Grammar &,
+       std::optional<std::reference_wrapper<const TokenStream>>) const;
+  std::string
+  dumpRecursive(const Grammar &,
+                std::optional<std::reference_wrapper<const TokenStream>>,
+                bool Abbreviated = false) const;
 
 private:
   friend class ForestArena;
diff --git a/clang-tools-extra/pseudo/lib/Forest.cpp b/clang-tools-extra/pseudo/lib/Forest.cpp
index e8e60e5ec475a4..adce731d6c1e1c 100644
--- a/clang-tools-extra/pseudo/lib/Forest.cpp
+++ b/clang-tools-extra/pseudo/lib/Forest.cpp
@@ -45,13 +45,21 @@ ForestNode::descendants() const {
   return {RecursiveIterator(this), RecursiveIterator()};
 }
 
-std::string ForestNode::dump(const Grammar &G) const {
+std::string ForestNode::dump(
+    const Grammar &G,
+    std::optional<std::reference_wrapper<const TokenStream>> Code) const {
   switch (kind()) {
   case Ambiguous:
     return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
   case Terminal:
-    return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
-                         startTokenIndex());
+    if (Code) {
+      return llvm::formatv("{0} := tok[{1}] ({2})", G.symbolName(symbol()),
+                           startTokenIndex(),
+                           Code->get().tokens()[startTokenIndex()]);
+    } else {
+      return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()),
+                           startTokenIndex());
+    }
   case Sequence:
     return G.dumpRule(rule());
   case Opaque:
@@ -60,8 +68,10 @@ std::string ForestNode::dump(const Grammar &G) const {
   llvm_unreachable("Unhandled node kind!");
 }
 
-std::string ForestNode::dumpRecursive(const Grammar &G,
-                                      bool Abbreviated) const {
+std::string ForestNode::dumpRecursive(
+    const Grammar &G,
+    std::optional<std::reference_wrapper<const TokenStream>> Code,
+    bool Abbreviated) const {
   using llvm::formatv;
   Token::Index MaxToken = 0;
   // Count visits of nodes so we can mark those seen multiple times.
@@ -95,7 +105,7 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
   std::string Result;
   constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
   std::function<void(const ForestNode *, Token::Index, std::optional<SymbolID>,
-                     LineDecoration &LineDec)>
+                     LineDecoration LineDec)>
       Dump = [&](const ForestNode *P, Token::Index End,
                  std::optional<SymbolID> ElidedParent, LineDecoration LineDec) {
         bool SharedNode = VisitCounts.find(P)->getSecond() > 1;
@@ -145,13 +155,13 @@ std::string ForestNode::dumpRecursive(const Grammar &G,
 
           // The first time, print as #1. Later, =#1.
           if (First) {
-            Result += formatv("{0} #{1}", P->dump(G), ID);
+            Result += formatv("{0} #{1}", P->dump(G, Code), ID);
           } else {
             Result += formatv("{0} =#{1}", G.symbolName(P->symbol()), ID);
             Children = {}; // Don't walk the children again.
           }
         } else {
-          Result.append(P->dump(G));
+          Result.append(P->dump(G, Code));
         }
         Result.push_back('\n');
 
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 6a64760749cefe..4797dc01cdc13b 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -51,6 +51,9 @@ static opt<bool> Disambiguate("disambiguate",
                               desc("Choose best tree from parse forest"));
 static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
 static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
+static opt<bool>
+    PrintTerminalTokens("print-terminal-tokens",
+                           desc("Print terminal tokens in parse forest"));
 static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
                               init(true));
 static opt<std::string> HTMLForest("html-forest",
@@ -161,9 +164,14 @@ int main(int argc, char *argv[]) {
     auto &Root =
         glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
                  *StartSymID, Lang);
+    std::optional<std::reference_wrapper<const TokenStream>> Code;
+    if (PrintTerminalTokens) {
+      Code = *ParseableStream;
+    }
     // If we're disambiguating, we'll print at the end instead.
     if (PrintForest && !Disambiguate)
-      llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
+      llvm::outs() << Root.dumpRecursive(Lang.G, Code,
+                                         /*Abbreviated=*/ForestAbbrev);
     clang::pseudo::Disambiguation Disambig;
     if (Disambiguate)
       Disambig = clang::pseudo::disambiguate(&Root, {});
@@ -234,7 +242,7 @@ int main(int argc, char *argv[]) {
       ForestNode *DisambigRoot = &Root;
       removeAmbiguities(DisambigRoot, Disambig);
       llvm::outs() << "Disambiguated tree:\n";
-      llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
+      llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, Code,
                                                   /*Abbreviated=*/ForestAbbrev);
     }
   }

>From ed5e37ba210ea76c35d20f3d14cc985e987fa8fd Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sat, 6 Apr 2024 19:41:45 -0500
Subject: [PATCH 2/3] Fix a LLVM_DEBUG

---
 clang-tools-extra/pseudo/lib/GLR.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index ac43c02db521eb..e4b5be79d7e58d 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -527,7 +527,8 @@ class GLRReduce {
         SequenceNodes.size() == 1
             ? SequenceNodes.front()
             : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
-    LLVM_DEBUG(llvm::dbgs() << "    --> " << Parsed->dump(Lang.G) << "\n");
+    LLVM_DEBUG(llvm::dbgs()
+               << "    --> " << Parsed->dump(Lang.G, std::nullopt) << "\n");
 
     // Bases for this family, deduplicate them, and group by the goTo State.
     sortAndUnique(FamilyBases);

>From ac98abcb934934b94c61bcf68fdfcb3b877e6505 Mon Sep 17 00:00:00 2001
From: Jeremy <51220084+jeremy-rifkin at users.noreply.github.com>
Date: Sun, 7 Apr 2024 00:10:03 -0500
Subject: [PATCH 3/3] Improvements and fixes

---
 clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp          |  2 +-
 .../pseudo/include/clang-pseudo/Forest.h            | 13 +++++++++----
 clang-tools-extra/pseudo/lib/GLR.cpp                |  3 +--
 clang-tools-extra/pseudo/tool/ClangPseudo.cpp       |  2 +-
 clang-tools-extra/pseudo/unittests/ForestTest.cpp   |  6 +++---
 5 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
index 33b3da1ed6ea9f..87b9d15480cc35 100644
--- a/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
+++ b/clang-tools-extra/pseudo/fuzzer/Fuzzer.cpp
@@ -46,7 +46,7 @@ class Fuzzer {
         glrParse(clang::pseudo::ParseParams{ParseableStream, Arena, GSS},
                  *Lang.G.findNonterminal("translation-unit"), Lang);
     if (Print)
-      llvm::outs() << Root.dumpRecursive(Lang.G, std::nullopt);
+      llvm::outs() << Root.dumpRecursive(Lang.G);
   }
 };
 
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
index 642c489b3fba41..0735e1fae08014 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -114,12 +114,17 @@ class alignas(class ForestNode *) ForestNode {
   // Iteration over all nodes in the forest, including this.
   llvm::iterator_range<RecursiveIterator> descendants() const;
 
-  std::string
-  dump(const Grammar &,
-       std::optional<std::reference_wrapper<const TokenStream>>) const;
+  // Dump forest node to text. If Code is provided, terminals show the token
+  // as well as its index; otherwise only the token index is shown.
+  std::string dump(const Grammar &,
+                   std::optional<std::reference_wrapper<const TokenStream>>
+                       Code = std::nullopt) const;
+  // Dump forest node recursively to text. If Code is provided, terminals show
+  // the token as well as its index; otherwise only the token index is shown.
   std::string
   dumpRecursive(const Grammar &,
-                std::optional<std::reference_wrapper<const TokenStream>>,
+                std::optional<std::reference_wrapper<const TokenStream>> Code =
+                    std::nullopt,
                 bool Abbreviated = false) const;
 
 private:
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index e4b5be79d7e58d..ac43c02db521eb 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -527,8 +527,7 @@ class GLRReduce {
         SequenceNodes.size() == 1
             ? SequenceNodes.front()
             : &Params.Forest.createAmbiguous(F.Symbol, SequenceNodes);
-    LLVM_DEBUG(llvm::dbgs()
-               << "    --> " << Parsed->dump(Lang.G, std::nullopt) << "\n");
+    LLVM_DEBUG(llvm::dbgs() << "    --> " << Parsed->dump(Lang.G) << "\n");
 
     // Bases for this family, deduplicate them, and group by the goTo State.
     sortAndUnique(FamilyBases);
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 4797dc01cdc13b..ec5e6bdf7bb287 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -53,7 +53,7 @@ static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser stat
 static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
 static opt<bool>
     PrintTerminalTokens("print-terminal-tokens",
-                           desc("Print terminal tokens in parse forest"));
+                        desc("Print terminal tokens in parse forest"));
 static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
                               init(true));
 static opt<std::string> HTMLForest("html-forest",
diff --git a/clang-tools-extra/pseudo/unittests/ForestTest.cpp b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
index 36af896148209d..d959b69ecdc943 100644
--- a/clang-tools-extra/pseudo/unittests/ForestTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/ForestTest.cpp
@@ -73,12 +73,12 @@ TEST_F(ForestTest, DumpBasic) {
   const auto *Add =
       &Arena.createSequence(symbol("add-expression"), ruleFor("add-expression"),
                             {Left, &T[1], Right});
-  EXPECT_EQ(Add->dumpRecursive(G, true),
+  EXPECT_EQ(Add->dumpRecursive(G, std::nullopt, true),
             "[  0, end) add-expression := id-expression + id-expression\n"
             "[  0,   1) ├─id-expression~IDENTIFIER := tok[0]\n"
             "[  1,   2) ├─+ := tok[1]\n"
             "[  2, end) └─id-expression~IDENTIFIER := tok[2]\n");
-  EXPECT_EQ(Add->dumpRecursive(G, false),
+  EXPECT_EQ(Add->dumpRecursive(G, std::nullopt, false),
             "[  0, end) add-expression := id-expression + id-expression\n"
             "[  0,   1) ├─id-expression := IDENTIFIER\n"
             "[  0,   1) │ └─IDENTIFIER := tok[0]\n"
@@ -144,7 +144,7 @@ TEST_F(ForestTest, DumpAbbreviatedShared) {
 
   // We must not abbreviate away shared nodes: if we show A~* there's no way to
   // show that the intermediate B node is shared between A1 and A2.
-  EXPECT_EQ(A->dumpRecursive(G, /*Abbreviate=*/true),
+  EXPECT_EQ(A->dumpRecursive(G, std::nullopt, /*Abbreviate=*/true),
             "[  0, end) A := <ambiguous>\n"
             "[  0, end) ├─A~B := * #1\n"
             "[  0, end) │ └─* := tok[0]\n"



More information about the cfe-commits mailing list