[clang-tools-extra] a0f4c10 - [pseudo] Add error-recovery framework & brace-based recovery

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Tue Jun 28 12:08:53 PDT 2022


Author: Sam McCall
Date: 2022-06-28T21:08:43+02:00
New Revision: a0f4c10ae227a62c2a63611e64eba83f0ff0f577

URL: https://github.com/llvm/llvm-project/commit/a0f4c10ae227a62c2a63611e64eba83f0ff0f577
DIFF: https://github.com/llvm/llvm-project/commit/a0f4c10ae227a62c2a63611e64eba83f0ff0f577.diff

LOG: [pseudo] Add error-recovery framework & brace-based recovery

The idea is:
 - a parse failure is detected when all heads die when trying to shift
   the next token
 - we can recover by choosing a nonterminal we're partway through parsing,
   and determining where it ends through nonlocal means (e.g. matching brackets)
 - we can find candidates by walking up the stack from the (ex-)heads
 - the token range is defined using heuristics attached to grammar rules
 - the unparsed region is represented in the forest by an Opaque node

This patch has the core GLR functionality.
It does not allow recovery heuristics to be attached as extensions to
the grammar, but rather infers a brace-based heuristic.

Expected followups:
 - make recovery heuristics grammar extensions (depends on D127448)
 - add recover to our grammar for bracketed constructs and sequence nodes
 - change the structure of our augmented `_ := start` rules to eliminate
   some special-cases in glrParse.
 - (if I can work out how): avoid some spurious recovery cases described
   in comments
 - grammar changes to eliminate the hard distinction between init-list
   and designated-init-list shown in the recovery-init-list.cpp testcase

Differential Revision: https://reviews.llvm.org/D128486

Added: 
    clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp

Modified: 
    clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
    clang-tools-extra/pseudo/lib/GLR.cpp
    clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
    clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
    clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
    clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
    clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
    clang-tools-extra/pseudo/unittests/GLRTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
index a3e8611de425..edc4b2acc0a8 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -144,6 +144,26 @@ void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
 void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
                const ParseParams &Params);
 
+// Heuristically recover from a state where no further parsing is possible.
+//
+// OldHeads is the parse state at TokenIndex.
+// This function consumes consumes zero or more tokens (advancing TokenIndex),
+// and places any recovery states created in NewHeads.
+//
+// On failure, NewHeads is empty and TokenIndex is unchanged.
+//
+// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
+// there is a risk of the parser falling into an infinite loop, creating an
+// endless sequence of recovery nodes.
+// Generally it is safe for recovery to match 0 tokens against sequence symbols
+// like `statement-seq`, as the grammar won't permit another statement-seq
+// immediately afterwards. However recovery strategies for `statement` should
+// consume at least one token, as statements may be adjacent in the input.
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads);
+
 } // namespace pseudo
 } // namespace clang
 

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
index 382da41397f0..7240f5adbb03 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
@@ -81,9 +81,12 @@ inline tok::TokenKind symbolToToken(SymbolID SID) {
   assert(SID < NumTerminals);
   return static_cast<tok::TokenKind>(SID);
 }
-inline SymbolID tokenSymbol(tok::TokenKind TK) {
+inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
   return TokenFlag | static_cast<SymbolID>(TK);
 }
+// Error recovery strategies.
+// FIXME: these should be provided as extensions instead.
+enum class RecoveryStrategy : uint8_t { None, Braces };
 
 // An extension is a piece of native code specific to a grammar that modifies
 // the behavior of annotated rules. One ExtensionID is assigned for each unique
@@ -107,7 +110,7 @@ struct Rule {
   // length to 9 (this is the longest sequence in cxx grammar).
   static constexpr unsigned SizeBits = 4;
   static constexpr unsigned MaxElements = 9;
-  static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
+  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
   static_assert(SizeBits + SymbolBits <= 16,
                 "Must be able to store symbol ID + size efficiently");
 
@@ -123,6 +126,13 @@ struct Rule {
   // being set for this rule.
   ExtensionID Guard = 0;
 
+  // Specifies the index within Sequence eligible for error recovery.
+  // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
+  // should recover by finding the matching brace, and forcing stmt-seq to match
+  // everything between braces.
+  uint8_t RecoveryIndex = -1;
+  RecoveryStrategy Recovery = RecoveryStrategy::None;
+
   llvm::ArrayRef<SymbolID> seq() const {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
index 1b5365327431..f5c2cc7a1623 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
@@ -137,8 +137,20 @@ class LRGraph {
     SymbolID Label;
   };
 
+  // A possible error recovery: choose to match some tokens against a symbol.
+  //
+  // e.g. a state that contains
+  //   stmt := { . stmt-seq [recover=braces] }
+  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
+  struct Recovery {
+    StateID Src; // The state we are in when encountering the error.
+    RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
+    SymbolID Result;           // The symbol that is produced.
+  };
+
   llvm::ArrayRef<State> states() const { return States; }
   llvm::ArrayRef<Edge> edges() const { return Edges; }
+  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
   llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
     return StartStates;
   }
@@ -147,12 +159,15 @@ class LRGraph {
 
 private:
   LRGraph(std::vector<State> States, std::vector<Edge> Edges,
+          std::vector<Recovery> Recoveries,
           std::vector<std::pair<SymbolID, StateID>> StartStates)
       : States(std::move(States)), Edges(std::move(Edges)),
-        StartStates(std::move(StartStates)) {}
+        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
+  }
 
   std::vector<State> States;
   std::vector<Edge> Edges;
+  std::vector<Recovery> Recoveries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
index 70ce52924f11..c8e48dbaf309 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -121,6 +121,14 @@ class LRTable {
     uint16_t Value : ValueBits;
   };
 
+  struct Recovery {
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+
+  // Returns all available actions for the given state on a terminal.
+  // Expected to be called by LR parsers.
+  llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
   // Returns the state after we reduce a nonterminal.
   // Expected to be called by LR parsers.
   // REQUIRES: Nonterminal is valid here.
@@ -151,6 +159,12 @@ class LRTable {
                            symbolToToken(Terminal));
   }
 
+  // Looks up available recovery actions if we stopped parsing in this state.
+  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
+    return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
+                              Recoveries.data() + RecoveryOffset[State + 1]);
+  }
+
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
   //
@@ -188,9 +202,15 @@ class LRTable {
     StateID State;
     RuleID Rule;
   };
-  // Build a specifid table for testing purposes.
-  static LRTable buildForTests(const Grammar &G, llvm::ArrayRef<Entry>,
-                               llvm::ArrayRef<ReduceEntry>);
+  struct RecoveryEntry {
+    StateID State;
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+  // Build a specified table for testing purposes.
+  static LRTable buildForTests(const Grammar &, llvm::ArrayRef<Entry>,
+                               llvm::ArrayRef<ReduceEntry>,
+                               llvm::ArrayRef<RecoveryEntry> = {});
 
 private:
   // Looks up actions stored in the generic table.
@@ -222,6 +242,11 @@ class LRTable {
   // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
   // as an index: Nonterminal * NUM_TOKENS + Token.
   llvm::BitVector FollowSets;
+
+  // Recovery stores all recovery actions from all states.
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector<uint32_t> RecoveryOffset;
+  std::vector<Recovery> Recoveries;
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
 

diff  --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index 0b9cf46245a9..ccb3cd770f7f 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -24,6 +24,156 @@
 
 namespace clang {
 namespace pseudo {
+namespace {
+
+llvm::Optional<unsigned>
+findRecoveryEndpoint(RecoveryStrategy Strategy,
+                     const GSS::Node *RecoveryNode,
+                     const TokenStream &Tokens) {
+  assert(Strategy == RecoveryStrategy::Braces);
+  const ForestNode *LBrace = RecoveryNode->Payload;
+  assert(LBrace->kind() == ForestNode::Terminal &&
+         LBrace->symbol() == tokenSymbol(tok::l_brace));
+  if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair())
+    return Tokens.index(*RBrace);
+  return llvm::None;
+}
+
+} // namespace
+
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads) {
+  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
+  // Describes a possibility to recover by forcibly interpreting a range of
+  // tokens around the cursor as a nonterminal that we expected to see.
+  struct PlaceholderRecovery {
+    // The token prior to the nonterminal which is being recovered.
+    // This starts of the region we're skipping, so higher Position is better.
+    Token::Index Position;
+    // The nonterminal which will be created in order to recover.
+    SymbolID Symbol;
+    // The heuristic used to choose the bounds of the nonterminal to recover.
+    RecoveryStrategy Strategy;
+
+    // The GSS head where we are expecting the recovered nonterminal.
+    const GSS::Node *RecoveryNode;
+    // Payload of nodes on the way back from the OldHead to the recovery node.
+    // These represent the partial parse that is being discarded.
+    // They should become the children of the opaque recovery node.
+    //
+    // There may be multiple paths leading to the same recovery node, we choose
+    // one arbitrarily.
+    std::vector<const ForestNode *> Path;
+  };
+  std::vector<PlaceholderRecovery> Options;
+
+  // Find recovery options by walking up the stack.
+  //
+  // This is similar to exception handling: we walk up the "frames" of nested
+  // rules being parsed until we find one that has a "handler" which allows us
+  // to determine the node bounds without parsing it.
+  //
+  // Unfortunately there's a significant 
diff erence: the stack contains both
+  // "upward" nodes (ancestor parses) and "leftward" ones.
+  //  e.g. when parsing `int(2 + ?)`, the stack contains:
+  //    expr := expr + . expr     - which we're currently parsing
+  //    expr := type ( . expr )   - (up) we should recover this outer expr
+  //    expr := . type ( expr )   - (up+left) we should not recover this type!
+  //
+  // It's not obvious how to avoid collecting the latter as a recovery option.
+  // I think the distinction is ill-defined after merging items into states.
+  // For now, we have to take this into account when defining recovery rules.
+  // FIXME: find a more satisfying way to avoid such false recovery.
+  std::vector<const ForestNode *> Path;
+  llvm::DenseSet<const GSS::Node *> Seen;
+  auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) {
+    if (!Seen.insert(N).second)
+      return;
+    for (auto Strategy : Params.Table.getRecovery(N->State)) {
+      Options.push_back(PlaceholderRecovery{
+          NextTok,
+          Strategy.Result,
+          Strategy.Strategy,
+          N,
+          Path,
+      });
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Option: recover " << Params.G.symbolName(Strategy.Result)
+                 << " at token " << NextTok << "\n");
+    }
+    Path.push_back(N->Payload);
+    for (const GSS::Node *Parent : N->parents())
+      DFS(Parent, N->Payload->startTokenIndex(), DFS);
+    Path.pop_back();
+  };
+  for (auto *N : llvm::reverse(OldHeads))
+    DFS(N, TokenIndex, DFS);
+
+  // Now we select the option(s) we will use to recover.
+  //
+  // We prefer options starting further right, as these discard less code
+  // (e.g. we prefer to recover inner scopes rather than outer ones).
+  // The options also need to agree on an endpoint, so the parser has a
+  // consistent position afterwards.
+  //
+  // So conceptually we're sorting by the tuple (start, end), though we avoid
+  // computing `end` for options that can't be winners.
+
+  // Consider options starting further right first.
+  // Don't drop the others yet though, we may still use them if preferred fails.
+  llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
+    return L.Position > R.Position;
+  });
+
+  assert(NewHeads.empty()); // We may repeatedly populate and clear it.
+  llvm::Optional<Token::Range> RecoveryRange;
+  for (const PlaceholderRecovery &Option : Options) {
+    // If this starts further right than options we've already found, then
+    // we'll never find anything better. Skip computing End for the rest.
+    if (RecoveryRange && Option.Position < RecoveryRange->Begin)
+      break;
+
+    auto End =
+        findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens);
+    // Only consider recovery that advances the parse.
+    if (!End || *End <= TokenIndex)
+      continue;
+    if (RecoveryRange) {
+      // If this is worse than our previous options, ignore it.
+      if (RecoveryRange->End < *End)
+        continue;
+      // If this is an improvement over our previous options, then drop them.
+      if (RecoveryRange->End > *End)
+        NewHeads.clear();
+    }
+    // Create recovery nodes and heads for them in the GSS. These may be
+    // discarded if a better recovery is later found, but this path isn't hot.
+    RecoveryRange = {Option.Position, *End};
+    const ForestNode &Placeholder =
+        Params.Forest.createOpaque(Option.Symbol, Option.Position);
+    const GSS::Node *NewHead = Params.GSStack.addNode(
+        Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol),
+        &Placeholder, {Option.RecoveryNode});
+    NewHeads.push_back(NewHead);
+  }
+
+  // Advance the cursor, whether recovery succeeded or not.
+  if (RecoveryRange) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
+      for (const auto *Head : NewHeads)
+        llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol());
+      llvm::dbgs() << "\n";
+    });
+    TokenIndex = RecoveryRange->End;
+  } else {
+    LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
+                            << " strategies\n");
+    ++TokenIndex;
+  }
+}
 
 using StateID = LRTable::StateID;
 
@@ -31,8 +181,9 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
   std::vector<std::string> ParentStates;
   for (const auto *Parent : N.parents())
     ParentStates.push_back(llvm::formatv("{0}", Parent->State));
-  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
-                      N.Payload->symbol(), llvm::join(ParentStates, ", "));
+  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
+                      N.Payload ? N.Payload->symbol() : 0,
+                      llvm::join(ParentStates, ", "));
   return OS;
 }
 
@@ -427,15 +578,27 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
     GSS.gc(std::move(Roots));
   };
   // Each iteration fully processes a single token.
-  for (unsigned I = 0; I < Terminals.size(); ++I) {
+  for (unsigned I = 0; I < Terminals.size();) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
                    "Next token {0} (id={1})\n",
                    G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
     // Consume the token.
     glrShift(Heads, Terminals[I], Params, NextHeads);
+
+    // If we weren't able to consume the token, try to skip over some tokens
+    // so we can keep parsing.
+    if (NextHeads.empty()) {
+      glrRecover(Heads, I, Tokens, Params, NextHeads);
+      if (NextHeads.empty())
+        // FIXME: Ensure the `_ := start-symbol` rules have a fallback
+        // error-recovery strategy attached. Then this condition can't happen.
+        return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
+    } else
+      ++I;
+
     // Form nonterminals containing the token we just consumed.
-    SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
-                                                   : Terminals[I + 1].symbol();
+    SymbolID Lookahead =
+        I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);
@@ -444,22 +607,35 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
   }
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
 
+  // The parse was successful if we're in state `_ := start-symbol .`
   StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
-  const ForestNode *Result = nullptr;
-  for (const auto *Head : Heads) {
-    if (Head->State == AcceptState) {
-      assert(Head->Payload->symbol() == StartSymbol);
-      assert(Result == nullptr && "multiple results!");
-      Result = Head->Payload;
+  auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
+    const ForestNode *Result = nullptr;
+    for (const auto *Head : Heads) {
+      if (Head->State == AcceptState) {
+        assert(Head->Payload->symbol() == StartSymbol);
+        assert(Result == nullptr && "multiple results!");
+        Result = Head->Payload;
+      }
     }
-  }
-  if (Result)
+    return Result;
+  };
+  if (auto *Result = SearchForAccept(Heads))
     return *Result;
+  // Failed to parse the input, attempt to run recovery.
+  // FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
+  // More elegant is to include EOF in the token stream, and make the
+  // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
+  // would not be a special case: it show up as a failure to shift the EOF
+  // token.
+  unsigned I = Terminals.size();
+  glrRecover(Heads, I, Tokens, Params, NextHeads);
+  Reduce(NextHeads, tokenSymbol(tok::eof));
+  if (auto *Result = SearchForAccept(NextHeads))
+    return *Result;
+
   // We failed to parse the input, returning an opaque forest node for recovery.
-  //
-  // FIXME: We will need to invoke our generic error-recovery handlers when we
-  // reach EOF without reaching accept state, and involving the eof
-  // token in the above main for-loopmay be the best way to reuse the code).
+  // FIXME: as above, we can add fallback error handling so this is impossible.
   return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
 }
 
@@ -470,9 +646,10 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
 }
 
 const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
+
                               llvm::ArrayRef<const Node *> Parents) {
   Node *Result = new (allocate(Parents.size()))
-      Node({State, GCParity, static_cast<unsigned>(Parents.size())});
+      Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
   Alive.push_back(Result);
   ++NodesCreated;
   Result->Payload = Symbol;

diff  --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
index 8e3bcb7afb37..dc4d95820264 100644
--- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -59,8 +59,11 @@ std::string Grammar::dumpRule(RuleID RID) const {
   llvm::raw_string_ostream OS(Result);
   const Rule &R = T->Rules[RID];
   OS << symbolName(R.Target) << " :=";
-  for (SymbolID SID : R.seq())
-    OS << " " << symbolName(SID);
+  for (unsigned I = 0; I < R.Size; ++I) {
+    OS << " " << symbolName(R.Sequence[I]);
+    if (R.RecoveryIndex == I)
+      OS << " [recover=" << static_cast<unsigned>(R.Recovery) << "]";
+  }
   if (R.Guard)
     OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
   return Result;

diff  --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
index 9fbc34da7155..281c08681c92 100644
--- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -106,6 +106,17 @@ class GrammarBuilder {
 
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
+    // Wherever RHS contains { foo }, mark foo for brace-recovery.
+    // FIXME: this should be grammar annotations instead.
+    for (auto &Rule : T->Rules) {
+      for (unsigned I = 2; I < Rule.Size; ++I)
+        if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) &&
+            Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) &&
+            !isToken(Rule.Sequence[I - 1])) {
+          Rule.Recovery = RecoveryStrategy::Braces;
+          Rule.RecoveryIndex = I - 1;
+        }
+    }
     const auto &SymbolOrder = getTopologicalOrder(T.get());
     llvm::stable_sort(
         T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {

diff  --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
index 5312a675cdb8..290a4f32f576 100644
--- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
@@ -120,6 +120,20 @@ nextAvailableKernelItems(const State &S, const Grammar &G) {
   return Results;
 }
 
+std::vector<std::pair<RecoveryStrategy, SymbolID>>
+availableRecovery(const State &S, const Grammar &G) {
+  std::vector<std::pair<RecoveryStrategy, SymbolID>> Result;
+  for (const Item &I : S.Items) {
+    const auto &Rule = G.lookupRule(I.rule());
+    if (I.dot() != Rule.RecoveryIndex)
+      continue;
+    Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
+  }
+  llvm::sort(Result);
+  Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
+  return Result;
+}
+
 } // namespace
 
 std::string Item::dump(const Grammar &G) const {
@@ -130,9 +144,10 @@ std::string Item::dump(const Grammar &G) const {
       Results.push_back(G.symbolName(SID));
     return Results;
   };
-  return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
+  return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
                        llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
-                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
+                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
+                       Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
       .str();
 }
 
@@ -181,6 +196,11 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
       Edges.push_back({Src, Dst, Label});
     }
 
+    void insertRecovery(StateID Src, RecoveryStrategy Strategy,
+                        SymbolID Result) {
+      Recoveries.push_back({Src, Strategy, Result});
+    }
+
     // Returns a state with the given id.
     const State &find(StateID ID) const {
       assert(ID < States.size());
@@ -194,9 +214,10 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
     LRGraph build() && {
       States.shrink_to_fit();
       Edges.shrink_to_fit();
+      Recoveries.shrink_to_fit();
       llvm::sort(StartStates);
       StartStates.shrink_to_fit();
-      return LRGraph(std::move(States), std::move(Edges),
+      return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
                      std::move(StartStates));
     }
 
@@ -205,6 +226,7 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
     llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
     std::vector<State> States;
     std::vector<Edge> Edges;
+    std::vector<Recovery> Recoveries;
     const Grammar &G;
     std::vector<std::pair<SymbolID, StateID>> StartStates;
   } Builder(G);
@@ -225,15 +247,16 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
   }
 
   while (!PendingStates.empty()) {
-    auto CurrentStateID = PendingStates.back();
+    auto StateID = PendingStates.back();
     PendingStates.pop_back();
-    for (auto Next :
-         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+    for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
       auto Insert = Builder.insert(Next.second);
       if (Insert.second) // new state, insert to the pending queue.
         PendingStates.push_back(Insert.first);
-      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+      Builder.insertEdge(StateID, Insert.first, Next.first);
     }
+    for (auto Recovery : availableRecovery(Builder.find(StateID), G))
+      Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
   }
   return std::move(Builder).build();
 }

diff  --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
index 59ea4ce5e327..3d36042044ad 100644
--- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
@@ -11,6 +11,7 @@
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstdint>
 
 namespace llvm {
@@ -44,6 +45,7 @@ struct LRTable::Builder {
   llvm::DenseSet<Entry> Entries;
   llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduces;
   std::vector<llvm::DenseSet<SymbolID>> FollowSets;
+  std::vector<LRGraph::Recovery> Recoveries;
 
   LRTable build(unsigned NumStates) && {
     // E.g. given the following parsing table with 3 states and 3 terminals:
@@ -88,6 +90,26 @@ struct LRTable::Builder {
     }
     Table.StartStates = std::move(StartStates);
 
+    // Error recovery entries: sort (no dups already), and build offset lookup.
+    llvm::sort(Recoveries,
+               [&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) {
+                 return std::tie(L.Src, L.Result, L.Strategy) <
+                        std::tie(R.Src, R.Result, R.Strategy);
+               });
+    Table.Recoveries.reserve(Recoveries.size());
+    for (const auto &R : Recoveries)
+      Table.Recoveries.push_back({R.Strategy, R.Result});
+    Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
+    SortedIndex = 0;
+    for (StateID State = 0; State < NumStates; ++State) {
+      Table.RecoveryOffset[State] = SortedIndex;
+      while (SortedIndex < Recoveries.size() &&
+             Recoveries[SortedIndex].Src == State)
+        SortedIndex++;
+    }
+    Table.RecoveryOffset[NumStates] = SortedIndex;
+    assert(SortedIndex == Recoveries.size());
+
     // Compile the follow sets into a bitmap.
     Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size());
     for (SymbolID NT = 0; NT < FollowSets.size(); ++NT)
@@ -114,7 +136,8 @@ struct LRTable::Builder {
 };
 
 LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
-                               llvm::ArrayRef<ReduceEntry> Reduces) {
+                               llvm::ArrayRef<ReduceEntry> Reduces,
+                               llvm::ArrayRef<RecoveryEntry> Recoveries) {
   StateID MaxState = 0;
   for (const auto &Entry : Entries) {
     MaxState = std::max(MaxState, Entry.State);
@@ -128,6 +151,8 @@ LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
   for (const ReduceEntry &E : Reduces)
     Build.Reduces[E.State].insert(E.Rule);
   Build.FollowSets = followSets(G);
+  for (const auto &R : Recoveries)
+    Build.Recoveries.push_back({R.State, R.Strategy, R.Result});
   return std::move(Build).build(/*NumStates=*/MaxState + 1);
 }
 
@@ -135,6 +160,7 @@ LRTable LRTable::buildSLR(const Grammar &G) {
   auto Graph = LRGraph::buildLR0(G);
   Builder Build;
   Build.StartStates = Graph.startStates();
+  Build.Recoveries = Graph.recoveries();
   for (const auto &T : Graph.edges()) {
     Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
     Build.Entries.insert({T.Src, T.Label, Act});

diff  --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
index 6d7a6823d0bf..bcfdff192b99 100644
--- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
+++ b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
@@ -2,7 +2,7 @@
 class Foo {
 public:
 };
-// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification }
+// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] }
 // CHECK-NEXT: ├─class-head := class-key class-head-name
 // CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
 // CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]

diff  --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
new file mode 100644
index 000000000000..a8bbe88dadd8
--- /dev/null
+++ b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
@@ -0,0 +1,13 @@
+// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
+auto x = { complete garbage };
+// CHECK:      translation-unit~simple-declaration
+// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
+// CHECK-NEXT: ├─init-declarator-list~init-declarator
+// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1]
+// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
+// CHECK-NEXT: │   ├─= := tok[2]
+// CHECK-NEXT: │   └─initializer-clause~braced-init-list
+// CHECK-NEXT: │     ├─{ := tok[3]
+// CHECK-NEXT: │     ├─initializer-list := <opaque>
+// CHECK-NEXT: │     └─} := tok[6]
+// CHECK-NEXT: └─; := tok[7]

diff  --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
index 846f2dbf1677..a37fda3fe2f5 100644
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -32,11 +33,13 @@ namespace {
 using Action = LRTable::Action;
 using testing::AllOf;
 using testing::ElementsAre;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher<const GSS::Node *>
 parents(llvm::ArrayRef<const GSS::Node *> Parents) {
@@ -238,9 +241,9 @@ TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
       /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
       /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-      GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-                      /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+      /*State=*/3, /*ForestNode=*/ClassNameNode,
+      /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
       GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                       /*Parents=*/{GSSNode2});
@@ -363,6 +366,124 @@ TEST_F(GLRTest, ReduceLookahead) {
   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //    0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //    0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
+      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Question1 = &Arena.createTerminal(tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+                            AllOf(state(3), parsedSymbolID(id("word")),
+                                  parents({OpenedBraces}), start(1u))));
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+             {G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //    0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //    0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{},
+      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions LOptions;
+  // Innermost brace is unmatched, to test fallback to next brace.
+  TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions);
+  Tokens.tokens()[0].Pair = 5;
+  Tokens.tokens()[1].Pair = 4;
+  Tokens.tokens()[4].Pair = 1;
+  Tokens.tokens()[5].Pair = 0;
+
+  auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
+  auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *In1 = GSStack.addNode(1, Brace1, {Root});
+  const auto *In2 = GSStack.addNode(1, Brace2, {In1});
+  const auto *In3 = GSStack.addNode(1, Brace3, {In2});
+
+  unsigned TokenIndex = 3;
+  std::vector<const GSS::Node *> NewHeads;
+  glrRecover({In3}, TokenIndex, Tokens, {G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 5u);
+  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
+                                          parents({In2}), start(2u))));
+}
+
+TEST_F(GLRTest, RecoverAlternatives) {
+  // Recovery inside braces with multiple equally good options
+  //  Before:
+  //    0--1({)
+  //  After recovering either `word` or `number` inside the braces:
+  //    0--1({)--2(word)   // 2 is goto(1, word)
+  //          └--3(number) // 3 is goto(1, number)
+  buildGrammar({"number", "word"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G,
+      {
+          {/*State=*/1, id("number"), Action::goTo(2)},
+          {/*State=*/1, id("word"), Action::goTo(3)},
+      },
+      /*Reduce=*/{},
+      /*Recovery=*/
+      {
+          {/*State=*/1, RecoveryStrategy::Braces, id("number")},
+          {/*State=*/1, RecoveryStrategy::Braces, id("word")},
+      });
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+  unsigned TokenIndex = 1;
+
+  glrRecover({OpenedBraces}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 2u);
+  EXPECT_THAT(NewHeads,
+              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
+                                         parents({OpenedBraces}), start(1u)),
+                                   AllOf(state(3), parsedSymbolID(id("word")),
+                                         parents({OpenedBraces}), start(1u))));
+}
+
 TEST_F(GLRTest, PerfectForestNodeSharing) {
   // Run the GLR on a simple grammar and test that we build exactly one forest
   // node per (SymbolID, token range).
@@ -431,6 +552,40 @@ TEST_F(GLRTest, GLRReduceOrder) {
                                      "[  0, end)     └─IDENTIFIER := tok[0]\n");
 }
 
+TEST_F(GLRTest, RecoveryEndToEnd) {
+  // Simple example of brace-based recovery showing:
+  //  - recovered region includes tokens both ahead of and behind the cursor
+  //  - multiple possible recovery rules
+  //  - recovery from outer scopes is rejected
+  build(R"bnf(
+    _ := block
+
+    block := { block }
+    block := { numbers }
+    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
+  )bnf");
+  auto LRTable = LRTable::buildSLR(G);
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
+  pairBrackets(Tokens);
+
+  const ForestNode &Parsed =
+      glrParse(Tokens, {G, LRTable, Arena, GSStack}, id("block"));
+  EXPECT_EQ(Parsed.dumpRecursive(G),
+            "[  0, end) block := { block [recover=1] }\n"
+            "[  0,   1) ├─{ := tok[0]\n"
+            "[  1,   5) ├─block := <ambiguous>\n"
+            "[  1,   5) │ ├─block := { block [recover=1] }\n"
+            "[  1,   2) │ │ ├─{ := tok[1]\n"
+            "[  2,   4) │ │ ├─block := <opaque>\n"
+            "[  4,   5) │ │ └─} := tok[4]\n"
+            "[  1,   5) │ └─block := { numbers [recover=1] }\n"
+            "[  1,   2) │   ├─{ := tok[1]\n"
+            "[  2,   4) │   ├─numbers := <opaque>\n"
+            "[  4,   5) │   └─} := tok[4]\n"
+            "[  5, end) └─} := tok[5]\n");
+}
+
 TEST_F(GLRTest, NoExplicitAccept) {
   build(R"bnf(
     _ := test


        


More information about the cfe-commits mailing list