[PATCH] D130523: [pseudo] Perform unconstrained recovery prior to completion.

Mon Jul 25 15:02:26 PDT 2022

sammccall updated this revision to Diff 447487.
sammccall added a comment.

restore removed debug


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D130523/new/

https://reviews.llvm.org/D130523

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp


Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================

--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -604,6 +604,33 @@
             "[  5, end) └─} := tok[5]\n");
 }
 
+TEST_F(GLRTest, RecoverUnrestrictedReduce) {
+  // Here, ! is not in any rule and therefore not in the follow set of `word`.
+  // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
+
+  build(R"bnf(
+    _ := sentence
+
+    word := IDENTIFIER
+    sentence := word word [recover=AcceptAnyTokenInstead]
+  )bnf");
+
+  clang::LangOptions LOptions;
+  const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
+  TestLang.Table = LRTable::buildSLR(TestLang.G);
+  TestLang.RecoveryStrategies.try_emplace(
+      extensionID("AcceptAnyTokenInstead"),
+      [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
+
+  const ForestNode &Parsed =
+      glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
+  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
+            "[  0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
+            "[  0,   1) ├─word := IDENTIFIER\n"
+            "[  0,   1) │ └─IDENTIFIER := tok[0]\n"
+            "[  1, end) └─word := <opaque>\n");
+}
+
 TEST_F(GLRTest, NoExplicitAccept) {
   build(R"bnf(
     _ := test
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -597,6 +597,10 @@
   std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
                                                       /*ForestNode=*/nullptr,
                                                       {})};
+  // Invariant: Heads is partitioned by source: {shifted | reduced}.
+  // HeadsPartition is the index of the first head formed by reduction.
+  // We use this to discard and recreate the reduced heads during recovery.
+  unsigned HeadsPartition = 0;
   std::vector<const GSS::Node *> NextHeads;
   auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
     assert(NextHeads.empty() && "Running GC at the wrong time!");
@@ -619,8 +623,17 @@
     // If we weren't able to consume the token, try to skip over some tokens
     // so we can keep parsing.
     if (NextHeads.empty()) {
-      // FIXME: Heads may not be fully reduced, because our reductions were
-      // constrained by lookahead (but lookahead is meaningless to recovery).
+      // The reduction in the previous round was constrained by lookahead.
+      // On valid code this only rejects dead ends, but on broken code we should
+      // consider all possibilities.
+      //
+      // We discard all heads formed by reduction, and recreate them without
+      // this constraint. This may duplicate some nodes, but it's rare.
+      LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
+                                 "Re-reducing without lookahead.");
+      Heads.resize(HeadsPartition);
+      Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
+
       glrRecover(Heads, I, Params, Lang, NextHeads);
       if (NextHeads.empty())
         // FIXME: Ensure the `_ := start-symbol` rules have a fallback
@@ -632,6 +645,7 @@
     // Form nonterminals containing the token we just consumed.
     SymbolID Lookahead =
         I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
+    HeadsPartition = NextHeads.size();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -105,7 +105,11 @@
   bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
     assert(isToken(Terminal));
     assert(isNonterminal(Nonterminal));
-    return FollowSets.test(tok::NUM_TOKENS * Nonterminal +
+    // tok::unknown is a sentinel value used in recovery: can follow anything.
+    if (tok::unknown)
+      return true;
+    return Terminal == tokenSymbol(tok::unknown) ||
+           FollowSets.test(tok::NUM_TOKENS * Nonterminal +
                            symbolToToken(Terminal));
   }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D130523.447487.patch
Type: text/x-patch
Size: 4524 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20220725/316d9959/attachment.bin>