[clang-tools-extra] 3121167 - [pseudo] Add error-recovery framework & brace-based recovery

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Tue Jul 5 11:49:52 PDT 2022


Author: Sam McCall
Date: 2022-07-05T20:49:41+02:00
New Revision: 31211674889095299eb817d7be034fde23f7b00a

URL: https://github.com/llvm/llvm-project/commit/31211674889095299eb817d7be034fde23f7b00a
DIFF: https://github.com/llvm/llvm-project/commit/31211674889095299eb817d7be034fde23f7b00a.diff

LOG: [pseudo] Add error-recovery framework & brace-based recovery

The idea is:

- a parse failure is detected when all heads die when trying to shift the next token
- we can recover by choosing a nonterminal we're partway through parsing, and
  determining where it ends through nonlocal means (e.g. matching brackets)
- we can find candidates by walking up the stack from the (ex-)heads
- the token range is defined using heuristics attached to grammar rules
- the unparsed region is represented in the forest by an Opaque node

This patch has the core GLR functionality.
It does not allow recovery heuristics to be attached as extensions to
the grammar, but rather infers a brace-based heuristic.

Expected followups:

- make recovery heuristics grammar extensions (depends on D127448)
- add recovery to our grammar for bracketed constructs and sequence nodes
- change the structure of our augmented `_ := start` rules to eliminate some
  special-cases in glrParse.
- (if I can work out how): avoid some spurious recovery cases described in comments

(Previously mistakenly committed as a0f4c10ae227a62c2a63611e64eba83f0ff0f577)

Differential Revision: https://reviews.llvm.org/D128486

Added: 
    clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp

Modified: 
    clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
    clang-tools-extra/pseudo/lib/GLR.cpp
    clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
    clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
    clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
    clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
    clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
    clang-tools-extra/pseudo/unittests/GLRTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
index ebf54685b01b..7b0a80920da9 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -143,6 +143,25 @@ void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
 void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
                const ParseParams &Params, const Language &Lang);
 
+// Heuristically recover from a state where no further parsing is possible.
+//
+// OldHeads is the parse state at TokenIndex.
+// This function consumes zero or more tokens by advancing TokenIndex,
+// and places any recovery states created in NewHeads.
+//
+// On failure, NewHeads is empty and TokenIndex is unchanged.
+//
+// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
+// there is a risk of the parser falling into an infinite loop, creating an
+// endless sequence of recovery nodes.
+// Generally it is safe for recovery to match 0 tokens against sequence symbols
+// like `statement-seq`, as the grammar won't permit another statement-seq
+// immediately afterwards. However recovery strategies for `statement` should
+// consume at least one token, as statements may be adjacent in the input.
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const ParseParams &Params,
+                const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
+
 } // namespace pseudo
 } // namespace clang
 

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
index f60bf487eeaf..e2957cc2ec71 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
@@ -82,9 +82,12 @@ inline tok::TokenKind symbolToToken(SymbolID SID) {
   assert(SID < NumTerminals);
   return static_cast<tok::TokenKind>(SID);
 }
-inline SymbolID tokenSymbol(tok::TokenKind TK) {
+inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
   return TokenFlag | static_cast<SymbolID>(TK);
 }
+// Error recovery strategies.
+// FIXME: these should be provided as extensions instead.
+enum class RecoveryStrategy : uint8_t { None, Braces };
 
 // An extension is a piece of native code specific to a grammar that modifies
 // the behavior of annotated rules. One ExtensionID is assigned for each unique
@@ -108,7 +111,7 @@ struct Rule {
   // length to 9 (this is the longest sequence in cxx grammar).
   static constexpr unsigned SizeBits = 4;
   static constexpr unsigned MaxElements = 9;
-  static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
+  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
   static_assert(SizeBits + SymbolBits <= 16,
                 "Must be able to store symbol ID + size efficiently");
 
@@ -124,6 +127,14 @@ struct Rule {
   // being set for this rule.
   ExtensionID Guard = 0;
 
+  // Specifies the index within Sequence eligible for error recovery.
+  // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
+  // should recover by finding the matching brace, and forcing stmt-seq to match
+  // everything between braces.
+  // For now, only a single strategy at a single point is possible.
+  uint8_t RecoveryIndex = -1;
+  RecoveryStrategy Recovery = RecoveryStrategy::None;
+
   llvm::ArrayRef<SymbolID> seq() const {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
index 1b5365327431..f5c2cc7a1623 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
@@ -137,8 +137,20 @@ class LRGraph {
     SymbolID Label;
   };
 
+  // A possible error recovery: choose to match some tokens against a symbol.
+  //
+  // e.g. a state that contains
+  //   stmt := { . stmt-seq [recover=braces] }
+  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
+  struct Recovery {
+    StateID Src; // The state we are in when encountering the error.
+    RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
+    SymbolID Result;           // The symbol that is produced.
+  };
+
   llvm::ArrayRef<State> states() const { return States; }
   llvm::ArrayRef<Edge> edges() const { return Edges; }
+  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
   llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
     return StartStates;
   }
@@ -147,12 +159,15 @@ class LRGraph {
 
 private:
   LRGraph(std::vector<State> States, std::vector<Edge> Edges,
+          std::vector<Recovery> Recoveries,
           std::vector<std::pair<SymbolID, StateID>> StartStates)
       : States(std::move(States)), Edges(std::move(Edges)),
-        StartStates(std::move(StartStates)) {}
+        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
+  }
 
   std::vector<State> States;
   std::vector<Edge> Edges;
+  std::vector<Recovery> Recoveries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
index 7575b3271dc2..aac80ad53ffd 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -42,7 +42,6 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
 #include <cstdint>
 #include <vector>
 
@@ -68,6 +67,11 @@ class LRTable {
   using StateID = uint16_t;
   static constexpr unsigned StateBits = 13;
 
+  struct Recovery {
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+
   // Returns the state after we reduce a nonterminal.
   // Expected to be called by LR parsers.
   // If the nonterminal is invalid here, returns None.
@@ -105,6 +109,12 @@ class LRTable {
                            symbolToToken(Terminal));
   }
 
+  // Looks up available recovery actions if we stopped parsing in this state.
+  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
+    return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
+                              Recoveries.data() + RecoveryOffset[State + 1]);
+  }
+
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
   //
@@ -147,6 +157,8 @@ class LRTable {
     llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
     // FollowSets[NT] is the set of terminals that can follow the nonterminal.
     std::vector<llvm::DenseSet<SymbolID>> FollowSets;
+    // Recovery options available at each state.
+    std::vector<std::pair<StateID, Recovery>> Recoveries;
 
     LRTable build() &&;
   };
@@ -251,6 +263,11 @@ class LRTable {
   // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
   // as an index: Nonterminal * NUM_TOKENS + Token.
   llvm::BitVector FollowSets;
+
+  // Recovery stores all recovery actions from all states.
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector<uint32_t> RecoveryOffset;
+  std::vector<Recovery> Recoveries;
 };
 
 } // namespace pseudo

diff  --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index 7ef09bf419e9..2c1b1c09ce69 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -25,6 +25,172 @@
 
 namespace clang {
 namespace pseudo {
+namespace {
+
+Token::Index findRecoveryEndpoint(RecoveryStrategy Strategy,
+                                  const GSS::Node *RecoveryNode,
+                                  const TokenStream &Tokens) {
+  assert(Strategy == RecoveryStrategy::Braces);
+  const ForestNode *LBrace = RecoveryNode->Payload;
+  assert(LBrace->kind() == ForestNode::Terminal &&
+         LBrace->symbol() == tokenSymbol(tok::l_brace));
+  if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair())
+    return Tokens.index(*RBrace);
+  return Token::Invalid;
+}
+
+} // namespace
+
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const ParseParams &Params,
+                const Language &Lang,
+                std::vector<const GSS::Node *> &NewHeads) {
+  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
+  // Describes a possibility to recover by forcibly interpreting a range of
+  // tokens around the cursor as a nonterminal that we expected to see.
+  struct PlaceholderRecovery {
+    // The token prior to the nonterminal which is being recovered.
+    // This starts of the region we're skipping, so higher Position is better.
+    Token::Index Position;
+    // The nonterminal which will be created in order to recover.
+    SymbolID Symbol;
+    // The heuristic used to choose the bounds of the nonterminal to recover.
+    RecoveryStrategy Strategy;
+
+    // The GSS head where we are expecting the recovered nonterminal.
+    const GSS::Node *RecoveryNode;
+    // Payload of nodes on the way back from the OldHead to the recovery node.
+    // These represent the partial parse that is being discarded.
+    // They should become the children of the opaque recovery node.
+    // FIXME: internal structure of opaque nodes is not implemented.
+    //
+    // There may be multiple paths leading to the same recovery node, we choose
+    // one arbitrarily.
+    std::vector<const ForestNode *> DiscardedParse;
+  };
+  std::vector<PlaceholderRecovery> Options;
+
+  // Find recovery options by walking up the stack.
+  //
+  // This is similar to exception handling: we walk up the "frames" of nested
+  // rules being parsed until we find one that has a "handler" which allows us
+  // to determine the node bounds without parsing it.
+  //
+  // Unfortunately there's a significant 
diff erence: the stack contains both
+  // "upward" nodes (ancestor parses) and "leftward" ones.
+  // e.g. when parsing `{ if (1) ? }` as compound-stmt, the stack contains:
+  //   stmt := IF ( expr ) . stmt      - current state, we should recover here!
+  //   stmt := IF ( expr . ) stmt      - (left, no recovery here)
+  //   stmt := IF ( . expr ) stmt      - left, we should NOT recover here!
+  //   stmt := IF . ( expr ) stmt      - (left, no recovery here)
+  //   stmt-seq := . stmt              - up, we might recover here
+  //   compound-stmt := { . stmt-seq } - up, we should recover here!
+  //
+  // It's not obvious how to avoid collecting "leftward" recovery options.
+  // I think the distinction is ill-defined after merging items into states.
+  // For now, we have to take this into account when defining recovery rules.
+  // (e.g. in the expr recovery above, stay inside the parentheses).
+  // FIXME: find a more satisfying way to avoid such false recovery.
+  // FIXME: Add a test for spurious recovery once tests can define strategies.
+  std::vector<const ForestNode *> Path;
+  llvm::DenseSet<const GSS::Node *> Seen;
+  auto WalkUp = [&](const GSS::Node *N, Token::Index NextTok, auto &WalkUp) {
+    if (!Seen.insert(N).second)
+      return;
+    for (auto Strategy : Lang.Table.getRecovery(N->State)) {
+      Options.push_back(PlaceholderRecovery{
+          NextTok,
+          Strategy.Result,
+          Strategy.Strategy,
+          N,
+          Path,
+      });
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Option: recover " << Lang.G.symbolName(Strategy.Result)
+                 << " at token " << NextTok << "\n");
+    }
+    Path.push_back(N->Payload);
+    for (const GSS::Node *Parent : N->parents())
+      WalkUp(Parent, N->Payload->startTokenIndex(), WalkUp);
+    Path.pop_back();
+  };
+  for (auto *N : OldHeads)
+    WalkUp(N, TokenIndex, WalkUp);
+
+  // Now we select the option(s) we will use to recover.
+  //
+  // We prefer options starting further right, as these discard less code
+  // (e.g. we prefer to recover inner scopes rather than outer ones).
+  // The options also need to agree on an endpoint, so the parser has a
+  // consistent position afterwards.
+  //
+  // So conceptually we're sorting by the tuple (start, end), though we avoid
+  // computing `end` for options that can't be winners.
+
+  // Consider options starting further right first.
+  // Don't drop the others yet though, we may still use them if preferred fails.
+  llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
+    return L.Position > R.Position;
+  });
+
+  // We may find multiple winners, but they will have the same range.
+  llvm::Optional<Token::Range> RecoveryRange;
+  std::vector<const PlaceholderRecovery *> BestOptions;
+  for (const PlaceholderRecovery &Option : Options) {
+    // If this starts further left than options we've already found, then
+    // we'll never find anything better. Skip computing End for the rest.
+    if (RecoveryRange && Option.Position < RecoveryRange->Begin)
+      break;
+
+    auto End =
+        findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Params.Code);
+    // Recovery may not take the parse backwards.
+    if (End == Token::Invalid || End < TokenIndex)
+      continue;
+    if (RecoveryRange) {
+      // If this is worse than our previous options, ignore it.
+      if (RecoveryRange->End < End)
+        continue;
+      // If this is an improvement over our previous options, then drop them.
+      if (RecoveryRange->End > End)
+        BestOptions.clear();
+    }
+    // Create recovery nodes and heads for them in the GSS. These may be
+    // discarded if a better recovery is later found, but this path isn't hot.
+    RecoveryRange = {Option.Position, End};
+    BestOptions.push_back(&Option);
+  }
+
+  if (BestOptions.empty()) {
+    LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
+                            << " strategies\n");
+    return;
+  }
+
+  // We've settled on a set of recovery options, so create their nodes and
+  // advance the cursor.
+  LLVM_DEBUG({
+    llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
+    for (const auto *Option : BestOptions)
+      llvm::dbgs() << " " << Lang.G.symbolName(Option->Symbol);
+    llvm::dbgs() << "\n";
+  });
+  // FIXME: in general, we might have the same Option->Symbol multiple times,
+  // and we risk creating redundant Forest and GSS nodes.
+  // We also may inadvertently set up the next glrReduce to create a sequence
+  // node duplicating an opaque node that we're creating here.
+  // There are various options, including simply breaking ties between options.
+  // For now it's obscure enough to ignore.
+  for (const PlaceholderRecovery *Option : BestOptions) {
+    const ForestNode &Placeholder =
+        Params.Forest.createOpaque(Option->Symbol, RecoveryRange->Begin);
+    const GSS::Node *NewHead = Params.GSStack.addNode(
+        *Lang.Table.getGoToState(Option->RecoveryNode->State, Option->Symbol),
+        &Placeholder, {Option->RecoveryNode});
+    NewHeads.push_back(NewHead);
+  }
+  TokenIndex = RecoveryRange->End;
+}
 
 using StateID = LRTable::StateID;
 
@@ -32,8 +198,9 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
   std::vector<std::string> ParentStates;
   for (const auto *Parent : N.parents())
     ParentStates.push_back(llvm::formatv("{0}", Parent->State));
-  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
-                      N.Payload->symbol(), llvm::join(ParentStates, ", "));
+  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
+                      N.Payload ? N.Payload->symbol() : 0,
+                      llvm::join(ParentStates, ", "));
   return OS;
 }
 
@@ -420,8 +587,8 @@ class GLRReduce {
 
 } // namespace
 
-const ForestNode &glrParse( const ParseParams &Params, SymbolID StartSymbol,
-                           const Language& Lang) {
+const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
+                           const Language &Lang) {
   GLRReduce Reduce(Params, Lang);
   assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
   llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
@@ -444,15 +611,29 @@ const ForestNode &glrParse( const ParseParams &Params, SymbolID StartSymbol,
     GSS.gc(std::move(Roots));
   };
   // Each iteration fully processes a single token.
-  for (unsigned I = 0; I < Terminals.size(); ++I) {
+  for (unsigned I = 0; I < Terminals.size();) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
                    "Next token {0} (id={1})\n",
                   Lang.G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
     // Consume the token.
     glrShift(Heads, Terminals[I], Params, Lang, NextHeads);
+
+    // If we weren't able to consume the token, try to skip over some tokens
+    // so we can keep parsing.
+    if (NextHeads.empty()) {
+      // FIXME: Heads may not be fully reduced, because our reductions were
+      // constrained by lookahead (but lookahead is meaningless to recovery).
+      glrRecover(Heads, I, Params, Lang, NextHeads);
+      if (NextHeads.empty())
+        // FIXME: Ensure the `_ := start-symbol` rules have a fallback
+        // error-recovery strategy attached. Then this condition can't happen.
+        return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
+    } else
+      ++I;
+
     // Form nonterminals containing the token we just consumed.
-    SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
-                                                   : Terminals[I + 1].symbol();
+    SymbolID Lookahead =
+        I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);
@@ -461,23 +642,36 @@ const ForestNode &glrParse( const ParseParams &Params, SymbolID StartSymbol,
   }
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
 
+  // The parse was successful if we're in state `_ := start-symbol .`
   auto AcceptState = Lang.Table.getGoToState(StartState, StartSymbol);
   assert(AcceptState.hasValue() && "goto must succeed after start symbol!");
-  const ForestNode *Result = nullptr;
-  for (const auto *Head : Heads) {
-    if (Head->State == AcceptState) {
-      assert(Head->Payload->symbol() == StartSymbol);
-      assert(Result == nullptr && "multiple results!");
-      Result = Head->Payload;
+  auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
+    const ForestNode *Result = nullptr;
+    for (const auto *Head : Heads) {
+      if (Head->State == *AcceptState) {
+        assert(Head->Payload->symbol() == StartSymbol);
+        assert(Result == nullptr && "multiple results!");
+        Result = Head->Payload;
+      }
     }
-  }
-  if (Result)
+    return Result;
+  };
+  if (auto *Result = SearchForAccept(Heads))
+    return *Result;
+  // Failed to parse the input, attempt to run recovery.
+  // FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
+  // More elegant is to include EOF in the token stream, and make the
+  // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
+  // would not be a special case: it show up as a failure to shift the EOF
+  // token.
+  unsigned I = Terminals.size();
+  glrRecover(Heads, I, Params, Lang, NextHeads);
+  Reduce(NextHeads, tokenSymbol(tok::eof));
+  if (auto *Result = SearchForAccept(NextHeads))
     return *Result;
+
   // We failed to parse the input, returning an opaque forest node for recovery.
-  //
-  // FIXME: We will need to invoke our generic error-recovery handlers when we
-  // reach EOF without reaching accept state, and involving the eof
-  // token in the above main for-loopmay be the best way to reuse the code).
+  // FIXME: as above, we can add fallback error handling so this is impossible.
   return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
 }
 
@@ -488,9 +682,10 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
 }
 
 const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
+
                               llvm::ArrayRef<const Node *> Parents) {
   Node *Result = new (allocate(Parents.size()))
-      Node({State, GCParity, static_cast<unsigned>(Parents.size())});
+      Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
   Alive.push_back(Result);
   ++NodesCreated;
   Result->Payload = Symbol;

diff  --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
index 8e3bcb7afb37..dc4d95820264 100644
--- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -59,8 +59,11 @@ std::string Grammar::dumpRule(RuleID RID) const {
   llvm::raw_string_ostream OS(Result);
   const Rule &R = T->Rules[RID];
   OS << symbolName(R.Target) << " :=";
-  for (SymbolID SID : R.seq())
-    OS << " " << symbolName(SID);
+  for (unsigned I = 0; I < R.Size; ++I) {
+    OS << " " << symbolName(R.Sequence[I]);
+    if (R.RecoveryIndex == I)
+      OS << " [recover=" << static_cast<unsigned>(R.Recovery) << "]";
+  }
   if (R.Guard)
     OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
   return Result;

diff  --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
index 9fbc34da7155..281c08681c92 100644
--- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -106,6 +106,17 @@ class GrammarBuilder {
 
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
+    // Wherever RHS contains { foo }, mark foo for brace-recovery.
+    // FIXME: this should be grammar annotations instead.
+    for (auto &Rule : T->Rules) {
+      for (unsigned I = 2; I < Rule.Size; ++I)
+        if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) &&
+            Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) &&
+            !isToken(Rule.Sequence[I - 1])) {
+          Rule.Recovery = RecoveryStrategy::Braces;
+          Rule.RecoveryIndex = I - 1;
+        }
+    }
     const auto &SymbolOrder = getTopologicalOrder(T.get());
     llvm::stable_sort(
         T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {

diff  --git a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
index 5312a675cdb8..290a4f32f576 100644
--- a/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
@@ -120,6 +120,20 @@ nextAvailableKernelItems(const State &S, const Grammar &G) {
   return Results;
 }
 
+std::vector<std::pair<RecoveryStrategy, SymbolID>>
+availableRecovery(const State &S, const Grammar &G) {
+  std::vector<std::pair<RecoveryStrategy, SymbolID>> Result;
+  for (const Item &I : S.Items) {
+    const auto &Rule = G.lookupRule(I.rule());
+    if (I.dot() != Rule.RecoveryIndex)
+      continue;
+    Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
+  }
+  llvm::sort(Result);
+  Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
+  return Result;
+}
+
 } // namespace
 
 std::string Item::dump(const Grammar &G) const {
@@ -130,9 +144,10 @@ std::string Item::dump(const Grammar &G) const {
       Results.push_back(G.symbolName(SID));
     return Results;
   };
-  return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
+  return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
                        llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
-                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
+                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
+                       Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
       .str();
 }
 
@@ -181,6 +196,11 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
       Edges.push_back({Src, Dst, Label});
     }
 
+    void insertRecovery(StateID Src, RecoveryStrategy Strategy,
+                        SymbolID Result) {
+      Recoveries.push_back({Src, Strategy, Result});
+    }
+
     // Returns a state with the given id.
     const State &find(StateID ID) const {
       assert(ID < States.size());
@@ -194,9 +214,10 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
     LRGraph build() && {
       States.shrink_to_fit();
       Edges.shrink_to_fit();
+      Recoveries.shrink_to_fit();
       llvm::sort(StartStates);
       StartStates.shrink_to_fit();
-      return LRGraph(std::move(States), std::move(Edges),
+      return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
                      std::move(StartStates));
     }
 
@@ -205,6 +226,7 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
     llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
     std::vector<State> States;
     std::vector<Edge> Edges;
+    std::vector<Recovery> Recoveries;
     const Grammar &G;
     std::vector<std::pair<SymbolID, StateID>> StartStates;
   } Builder(G);
@@ -225,15 +247,16 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
   }
 
   while (!PendingStates.empty()) {
-    auto CurrentStateID = PendingStates.back();
+    auto StateID = PendingStates.back();
     PendingStates.pop_back();
-    for (auto Next :
-         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+    for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
       auto Insert = Builder.insert(Next.second);
       if (Insert.second) // new state, insert to the pending queue.
         PendingStates.push_back(Insert.first);
-      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+      Builder.insertEdge(StateID, Insert.first, Next.first);
     }
+    for (auto Recovery : availableRecovery(Builder.find(StateID), G))
+      Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
   }
   return std::move(Builder).build();
 }

diff  --git a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
index 15b66ceea75e..3d6d440dd4ab 100644
--- a/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
@@ -65,6 +65,25 @@ LRTable LRTable::Builder::build() && {
   }
   Table.ReduceOffset.push_back(Table.Reduces.size());
 
+  // Error recovery entries: sort (no dups already), and build offset lookup.
+  llvm::sort(Recoveries, [&](const auto &L, const auto &R) {
+    return std::tie(L.first, L.second.Result, L.second.Strategy) <
+           std::tie(R.first, R.second.Result, R.second.Strategy);
+  });
+  Table.Recoveries.reserve(Recoveries.size());
+  for (const auto &R : Recoveries)
+    Table.Recoveries.push_back({R.second.Strategy, R.second.Result});
+  Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
+  unsigned SortedIndex = 0;
+  for (StateID State = 0; State < NumStates; ++State) {
+    Table.RecoveryOffset[State] = SortedIndex;
+    while (SortedIndex < Recoveries.size() &&
+           Recoveries[SortedIndex].first == State)
+      SortedIndex++;
+  }
+  Table.RecoveryOffset[NumStates] = SortedIndex;
+  assert(SortedIndex == Recoveries.size());
+
   return Table;
 }
 
@@ -74,6 +93,10 @@ LRTable LRTable::buildSLR(const Grammar &G) {
   Build.StartStates = Graph.startStates();
   for (const auto &T : Graph.edges())
     Build.Transition.try_emplace({T.Src, T.Label}, T.Dst);
+  for (const auto &Entry : Graph.recoveries())
+    Build.Recoveries.push_back(
+        {Entry.Src, Recovery{Entry.Strategy, Entry.Result}});
+  Build.FollowSets = followSets(G);
   assert(Graph.states().size() <= (1 << StateBits) &&
          "Graph states execceds the maximum limit!");
   // Add reduce actions.

diff  --git a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
index e6dbbddf0906..071790e8d0a2 100644
--- a/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
+++ b/clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
@@ -2,7 +2,7 @@
 class Foo {
 public:
 };
-// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification }
+// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] }
 // CHECK-NEXT: ├─class-head := class-key class-head-name
 // CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
 // CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]

diff  --git a/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
new file mode 100644
index 000000000000..a8bbe88dadd8
--- /dev/null
+++ b/clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
@@ -0,0 +1,13 @@
+// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
+auto x = { complete garbage };
+// CHECK:      translation-unit~simple-declaration
+// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
+// CHECK-NEXT: ├─init-declarator-list~init-declarator
+// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1]
+// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
+// CHECK-NEXT: │   ├─= := tok[2]
+// CHECK-NEXT: │   └─initializer-clause~braced-init-list
+// CHECK-NEXT: │     ├─{ := tok[3]
+// CHECK-NEXT: │     ├─initializer-list := <opaque>
+// CHECK-NEXT: │     └─} := tok[6]
+// CHECK-NEXT: └─; := tok[7]

diff  --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
index 15ed5bb24274..c0f3e3dd46a0 100644
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,8 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Token.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Language.h"
+#include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TokenKinds.h"
@@ -33,11 +34,13 @@ namespace {
 using StateID = LRTable::StateID;
 using testing::AllOf;
 using testing::ElementsAre;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher<const GSS::Node *>
 parents(llvm::ArrayRef<const GSS::Node *> Parents) {
@@ -247,9 +250,9 @@ TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
       /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
       /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-      GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-                      /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+      /*State=*/3, /*ForestNode=*/ClassNameNode,
+      /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
       GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                       /*Parents=*/{GSSNode2});
@@ -366,6 +369,120 @@ TEST_F(GLRTest, ReduceLookahead) {
   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //    0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //    0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable::Builder B(TestLang.G);
+  B.Transition[{StateID{1}, id("word")}] = StateID{3};
+  B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("word")}});
+  TestLang.Table = std::move(B).build();
+
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Question1 = &Arena.createTerminal(tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
+                                          parents({OpenedBraces}), start(1u))));
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang,
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //    0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //    0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable::Builder B(TestLang.G);
+  B.Transition[{StateID{1}, id("body")}] = StateID{2};
+  B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("body")}});
+  TestLang.Table = std::move(B).build();
+
+  clang::LangOptions LOptions;
+  // Innermost brace is unmatched, to test fallback to next brace.
+  TokenStream Tokens = cook(lex("{ { { ? } }", LOptions), LOptions);
+  Tokens.tokens()[0].Pair = 5;
+  Tokens.tokens()[1].Pair = 4;
+  Tokens.tokens()[4].Pair = 1;
+  Tokens.tokens()[5].Pair = 0;
+
+  auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
+  auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *In1 = GSStack.addNode(1, Brace1, {Root});
+  const auto *In2 = GSStack.addNode(1, Brace2, {In1});
+  const auto *In3 = GSStack.addNode(1, Brace3, {In2});
+
+  unsigned TokenIndex = 3;
+  std::vector<const GSS::Node *> NewHeads;
+  glrRecover({In3}, TokenIndex, {Tokens, Arena, GSStack}, TestLang, NewHeads);
+  EXPECT_EQ(TokenIndex, 5u);
+  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
+                                          parents({In2}), start(2u))));
+}
+
+TEST_F(GLRTest, RecoverAlternatives) {
+  // Recovery inside braces with multiple equally good options
+  //  Before:
+  //    0--1({)
+  //  After recovering either `word` or `number` inside the braces:
+  //    0--1({)--2(word)   // 2 is goto(1, word)
+  //          └--3(number) // 3 is goto(1, number)
+  buildGrammar({"number", "word"}, {});
+  LRTable::Builder B(TestLang.G);
+  B.Transition[{StateID{1}, id("number")}] = StateID{2};
+  B.Transition[{StateID{1}, id("word")}] = StateID{3};
+  B.Recoveries.push_back(
+      {StateID{1}, {RecoveryStrategy::Braces, id("number")}});
+  B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("word")}});
+  TestLang.Table = std::move(B).build();
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+  unsigned TokenIndex = 1;
+
+  glrRecover({OpenedBraces}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 2u);
+  EXPECT_THAT(NewHeads,
+              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
+                                         parents({OpenedBraces}), start(1u)),
+                                   AllOf(state(3), parsedSymbolID(id("word")),
+                                         parents({OpenedBraces}), start(1u))));
+}
+
 TEST_F(GLRTest, PerfectForestNodeSharing) {
   // Run the GLR on a simple grammar and test that we build exactly one forest
   // node per (SymbolID, token range).
@@ -435,6 +552,40 @@ TEST_F(GLRTest, GLRReduceOrder) {
             "[  0, end)     └─IDENTIFIER := tok[0]\n");
 }
 
+TEST_F(GLRTest, RecoveryEndToEnd) {
+  // Simple example of brace-based recovery showing:
+  //  - recovered region includes tokens both ahead of and behind the cursor
+  //  - multiple possible recovery rules
+  //  - recovery from outer scopes is rejected
+  build(R"bnf(
+    _ := block
+
+    block := { block }
+    block := { numbers }
+    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
+  )bnf");
+  TestLang.Table = LRTable::buildSLR(TestLang.G);
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
+  pairBrackets(Tokens);
+
+  const ForestNode &Parsed =
+      glrParse({Tokens, Arena, GSStack}, id("block"), TestLang);
+  EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
+            "[  0, end) block := { block [recover=1] }\n"
+            "[  0,   1) ├─{ := tok[0]\n"
+            "[  1,   5) ├─block := <ambiguous>\n"
+            "[  1,   5) │ ├─block := { block [recover=1] }\n"
+            "[  1,   2) │ │ ├─{ := tok[1]\n"
+            "[  2,   4) │ │ ├─block := <opaque>\n"
+            "[  4,   5) │ │ └─} := tok[4]\n"
+            "[  1,   5) │ └─block := { numbers [recover=1] }\n"
+            "[  1,   2) │   ├─{ := tok[1]\n"
+            "[  2,   4) │   ├─numbers := <opaque>\n"
+            "[  4,   5) │   └─} := tok[4]\n"
+            "[  5, end) └─} := tok[5]\n");
+}
+
 TEST_F(GLRTest, NoExplicitAccept) {
   build(R"bnf(
     _ := test


        


More information about the cfe-commits mailing list