[clang-tools-extra] 2cc7463 - [pseudo] Perform unconstrained reduction prior to recovery.

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Fri Aug 19 06:14:47 PDT 2022


Author: Sam McCall
Date: 2022-08-19T15:07:36+02:00
New Revision: 2cc7463c85c056e956ad332448189ce3e9004182

URL: https://github.com/llvm/llvm-project/commit/2cc7463c85c056e956ad332448189ce3e9004182
DIFF: https://github.com/llvm/llvm-project/commit/2cc7463c85c056e956ad332448189ce3e9004182.diff

LOG: [pseudo] Perform unconstrained reduction prior to recovery.

Our GLR uses lookahead: only perform reductions that might be consumed by the
shift immediately following. However when shift fails and so reduce is followed
by recovery instead, this restriction is incorrect and leads to missing heads.

In turn this means certain recovery strategies can't be made to work. e.g.
```
ns := NAMESPACE { namespace-body } [recover=Skip]
ns-body := namespace_opt
```
When `namespace { namespace {` is parsed, we can recover the inner `ns` (using
the `Skip` strategy to ignore the missing `}`). However this `namespace` will
not be reduced to a `namespace-body` as EOF is not in the follow-set, and so we
are unable to recover the outer `ns`.

This patch fixes this by tracking which heads were produced by constrained
reduce, and discarding and rebuilding them before performing recovery.

This is a prerequisite for the `Skip` strategy mentioned above, though there are
some other limitations we need to address too.

Reviewed By: hokein

Differential Revision: https://reviews.llvm.org/D130523

Added: 
    

Modified: 
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
    clang-tools-extra/pseudo/lib/GLR.cpp
    clang-tools-extra/pseudo/unittests/GLRTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
index 890a5125a1ef..8d8f8f74f16e 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -105,7 +105,11 @@ class LRTable {
   bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
     assert(isToken(Terminal));
     assert(isNonterminal(Nonterminal));
-    return FollowSets.test(tok::NUM_TOKENS * Nonterminal +
+    // tok::unknown is a sentinel value used in recovery: can follow anything.
+    if (tok::unknown)
+      return true;
+    return Terminal == tokenSymbol(tok::unknown) ||
+           FollowSets.test(tok::NUM_TOKENS * Nonterminal +
                            symbolToToken(Terminal));
   }
 

diff  --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index 10a94860ffc9..1765eb18732c 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -601,6 +601,10 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
   std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
                                                       /*ForestNode=*/nullptr,
                                                       {})};
+  // Invariant: Heads is partitioned by source: {shifted | reduced}.
+  // HeadsPartition is the index of the first head formed by reduction.
+  // We use this to discard and recreate the reduced heads during recovery.
+  unsigned HeadsPartition = 0;
   std::vector<const GSS::Node *> NextHeads;
   auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
     assert(NextHeads.empty() && "Running GC at the wrong time!");
@@ -623,8 +627,17 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
     // If we weren't able to consume the token, try to skip over some tokens
     // so we can keep parsing.
     if (NextHeads.empty()) {
-      // FIXME: Heads may not be fully reduced, because our reductions were
-      // constrained by lookahead (but lookahead is meaningless to recovery).
+      // The reduction in the previous round was constrained by lookahead.
+      // On valid code this only rejects dead ends, but on broken code we should
+      // consider all possibilities.
+      //
+      // We discard all heads formed by reduction, and recreate them without
+      // this constraint. This may duplicate some nodes, but it's rare.
+      LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
+                                 "Re-reducing without lookahead.");
+      Heads.resize(HeadsPartition);
+      Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
+
       glrRecover(Heads, I, Params, Lang, NextHeads);
       if (NextHeads.empty())
         // FIXME: Ensure the `_ := start-symbol` rules have a fallback
@@ -636,6 +649,7 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
     // Form nonterminals containing the token we just consumed.
     SymbolID Lookahead =
         I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
+    HeadsPartition = NextHeads.size();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);

diff  --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
index 7abf9295d157..397ad6d3e800 100644
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -625,6 +625,32 @@ TEST_F(GLRTest, RecoverTerminal) {
             "[  1, end) └─; := <opaque>\n");
 }
 
+TEST_F(GLRTest, RecoverUnrestrictedReduce) {
+  // Here, ! is not in any rule and therefore not in the follow set of `word`.
+  // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
+
+  build(R"bnf(
+    _ := sentence
+
+    word := IDENTIFIER
+    sentence := word word [recover=AcceptAnyTokenInstead]
+  )bnf");
+
+  clang::LangOptions LOptions;
+  const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
+  TestLang.Table = LRTable::buildSLR(TestLang.G);
+  TestLang.RecoveryStrategies.try_emplace(
+      extensionID("AcceptAnyTokenInstead"),
+      [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
+
+  const ForestNode &Parsed =
+      glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
+  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
+            "[  0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
+            "[  0,   1) ├─word := IDENTIFIER\n"
+            "[  0,   1) │ └─IDENTIFIER := tok[0]\n"
+            "[  1, end) └─word := <opaque>\n");
+}
 
 TEST_F(GLRTest, NoExplicitAccept) {
   build(R"bnf(


        


More information about the cfe-commits mailing list