[clang] 20e05b9 - [syntax][pseudo] Add Grammar for the clang pseudo-parser

Thu Feb 3 02:28:51 PST 2022

Author: Haojian Wu
Date: 2022-02-03T11:28:27+01:00
New Revision: 20e05b9f0ebea35076b96c89257becd35d6de859

URL: https://github.com/llvm/llvm-project/commit/20e05b9f0ebea35076b96c89257becd35d6de859
DIFF: https://github.com/llvm/llvm-project/commit/20e05b9f0ebea35076b96c89257becd35d6de859.diff

LOG: [syntax][pseudo] Add Grammar for the clang pseudo-parser

This patch introduces the Grammar class, which is a critical piece for constructing
a table-based parser.

As the first patch, the scope is limited to:
  - define base types (symbol, rules) of modeling the grammar
  - construct Grammar by parsing the BNF file (annotations are excluded for now)

Differential Revision: https://reviews.llvm.org/D114790

Added: 
    clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
    clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
    clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
    clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
    clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
    clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp

Modified: 
    clang/lib/Tooling/Syntax/CMakeLists.txt
    clang/unittests/Tooling/Syntax/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
new file mode 100644
index 0000000000000..80db9f268ff13

--- /dev/null
+++ b/clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -0,0 +1,170 @@
+//===--- Grammar.h - grammar used by clang pseudo parser  --------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines base structures for parsing & modeling a grammar for a
+//  programming language:
+//
+//    # This is a fake C++ BNF grammar
+//    _ := translation-unit
+//    translation-unit := declaration-seq_opt
+//    declaration-seq := declaration
+//    declaration-seq := declaration-seq declaration
+//
+//  A grammar formally describes a language, and it is constructed by a set of
+//  production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
+//  non-terminal or terminal, identified by a SymbolID.
+//
+//  Notions about the BNF grammar:
+//  - "_" is the augmented symbol, formed by start symbols.
+//  - single-line comment is supported, starting with a #
+//  - A rule describes how a nonterminal (left side of :=) is constructed, and
+//    it is *per line* in the grammar file
+//  - Terminals (also called tokens) correspond to the clang::TokenKind; they
+//    are written in the grammar like "IDENTIFIER", "USING", "+"
+//  - Nonterminals are specified with "lower-case" names in the grammar; they
+//    shouldn't be nullable (i.e. must not derive an empty sequence)
+//  - optional symbols are supported (specified with a _opt suffix), and they
+//    will be eliminated during the grammar parsing stage
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+#define LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+// A SymbolID uniquely identifies a terminal/non-terminal symbol in a grammar.
+// Non-terminal IDs are indexes into a table of non-terminal symbols.
+// Terminal IDs correspond to the clang TokenKind enum.
+using SymbolID = uint16_t;
+// SymbolID is only 12 bits wide.
+// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
+static constexpr uint16_t SymbolBits = 12;
+static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
+// SymbolIDs with the top bit set are tokens/terminals.
+static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
+inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
+inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
+// The terminals are always the clang tok::TokenKind (not all are used).
+inline tok::TokenKind symbolToToken(SymbolID SID) {
+  assert(isToken(SID));
+  SID &= ~TokenFlag;
+  assert(SID < NumTerminals);
+  return static_cast<tok::TokenKind>(SID);
+}
+inline SymbolID tokenSymbol(tok::TokenKind TK) {
+  return TokenFlag | static_cast<SymbolID>(TK);
+}
+
+// A RuleID uniquely identifies a production rule in a grammar.
+// It is an index into a table of rules.
+using RuleID = uint16_t;
+// There are maximum 2^12 rules.
+static constexpr unsigned RuleBits = 12;
+
+// Represent a production rule in the grammar, e.g.
+//   expression := a b c
+//   ^Target       ^Sequence
+struct Rule {
+  Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
+
+  // Size occupies 4 bits, so in theory a sequence can hold up to 2^4 - 1
+  // symbols; we are stricter in order to reduce the size and limit the max
+  // length to 9 (this is the longest sequence in cxx grammar).
+  static constexpr unsigned SizeBits = 4;
+  static constexpr unsigned MaxElements = 9;
+  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
+  static_assert(SizeBits + SymbolBits <= 16,
+                "Must be able to store symbol ID + size efficiently");
+
+  // 16 bits for target symbol and size of sequence:
+  // SymbolID : 12 | Size : 4
+  SymbolID Target : SymbolBits;
+  uint8_t Size : SizeBits; // Size of the Sequence
+  SymbolID Sequence[MaxElements];
+  // The right-hand side of the rule: the first Size entries of Sequence.
+  llvm::ArrayRef<SymbolID> seq() const {
+    return llvm::ArrayRef<SymbolID>(Sequence, Size);
+  }
+  friend bool operator==(const Rule &L, const Rule &R) {
+    return L.Target == R.Target && L.seq() == R.seq();
+  }
+};
+
+struct GrammarTable;
+
+// Grammar that describes a programming language, e.g. C++. It represents the
+// contents of the specified grammar.
+// It is a building block for constructing a table-based parser.
+class Grammar {
+public:
+  explicit Grammar(std::unique_ptr<GrammarTable> T) : T(std::move(T)) {}
+
+  // Parses grammar from a BNF file.
+  // Diags is cleared first, then receives the diagnostics emitted by parsing.
+  static std::unique_ptr<Grammar> parseBNF(llvm::StringRef BNF,
+                                           std::vector<std::string> &Diags);
+
+  // Returns all rules of the given non-terminal symbol.
+  llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
+  const Rule &lookupRule(RuleID RID) const;
+
+  // Gets symbol (terminal or non-terminal) name.
+  // Terminal names look like "," (tok::comma), "OPERATOR" (tok::kw_operator).
+  llvm::StringRef symbolName(SymbolID) const;
+
+  // Dumps the whole grammar.
+  std::string dump() const;
+  // Dumps a particular rule.
+  std::string dumpRule(RuleID) const;
+  // Dumps all rules of the given nonterminal symbol.
+  std::string dumpRules(SymbolID) const;
+
+  const GrammarTable &table() const { return *T; }
+
+private:
+  std::unique_ptr<GrammarTable> T;
+};
+
+// Storage for the underlying data of the Grammar.
+// It can be constructed dynamically (from compiling BNF file) or statically
+// (a compiled data-source).
+struct GrammarTable {
+  struct Nonterminal {
+    std::string Name;
+    // Rules that construct the non-terminal, as a half-open [start, end) index
+    // range into the Rules table.
+    struct {
+      RuleID start;
+      RuleID end;
+    } RuleRange;
+  };
+
+  // The rules are sorted (and thus grouped) by target symbol.
+  // RuleID is the index of the vector.
+  std::vector<Rule> Rules;
+  // A table of terminal (aka token) names, one per clang::tok::TokenKind.
+  // clang::tok::TokenKind is the index of the table.
+  std::vector<std::string> Terminals;
+  // A table of nonterminals, sorted by name.
+  // SymbolID is the index of the table.
+  std::vector<Nonterminal> Nonterminals;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H

diff  --git a/clang/lib/Tooling/Syntax/CMakeLists.txt b/clang/lib/Tooling/Syntax/CMakeLists.txt
index e933faeb0f506..f8d9184977e8a 100644
--- a/clang/lib/Tooling/Syntax/CMakeLists.txt
+++ b/clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -19,3 +19,5 @@ add_clang_library(clangToolingSyntax
   DEPENDS
   omp_gen
   )
+
+add_subdirectory(Pseudo)

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
new file mode 100644
index 0000000000000..77dce4b70ea11
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangSyntaxPseudo
+  Grammar.cpp
+  GrammarBNF.cpp
+  
+  LINK_LIBS
+  clangBasic
+  clangLex
+  )

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
new file mode 100644
index 0000000000000..014e6b4d28bc6
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -0,0 +1,77 @@
+//===--- Grammar.cpp - Grammar for clang pseudo parser  ----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
+    : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
+  assert(Sequence.size() <= Rule::MaxElements);
+  llvm::copy(Sequence, this->Sequence);
+}
+
+llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  const auto &R = T->Nonterminals[SID].RuleRange;
+  assert(R.start <= R.end && R.end <= T->Rules.size());
+  return llvm::makeArrayRef(T->Rules.data() + R.start, R.end - R.start);
+}
+
+const Rule &Grammar::lookupRule(RuleID RID) const {
+  assert(RID < T->Rules.size());
+  return T->Rules[RID];
+}
+
+llvm::StringRef Grammar::symbolName(SymbolID SID) const {
+  if (isToken(SID))
+    return T->Terminals[symbolToToken(SID)];
+  return T->Nonterminals[SID].Name;
+}
+
+std::string Grammar::dumpRule(RuleID RID) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  const Rule &R = lookupRule(RID); // asserts that RID is in range
+  OS << symbolName(R.Target) << " :=";
+  for (SymbolID SID : R.seq())
+    OS << " " << symbolName(SID);
+  return Result;
+}
+
+std::string Grammar::dumpRules(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  std::string Result;
+  const auto &Range = T->Nonterminals[SID].RuleRange;
+  for (RuleID RID = Range.start; RID < Range.end; ++RID)
+    Result.append(dumpRule(RID)).push_back('\n');
+  return Result;
+}
+
+std::string Grammar::dump() const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  OS << "Nonterminals:\n";
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+    OS << llvm::formatv("  {0} {1}\n", SID, symbolName(SID));
+  OS << "Rules:\n";
+  for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
+    OS << llvm::formatv("  {0} {1}\n", RID, dumpRule(RID));
+  return OS.str();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang

diff  --git a/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
new file mode 100644
index 0000000000000..40181e049f253
--- /dev/null
+++ b/clang/lib/Tooling/Syntax/Pseudo/GrammarBNF.cpp
@@ -0,0 +1,260 @@
+//===--- GrammarBNF.cpp - build grammar from BNF files  ----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+namespace {
+static const llvm::StringRef OptSuffix = "_opt";
+static const llvm::StringRef StartSymbol = "_";
+
+void initTerminals(std::vector<std::string> &Out) {
+  Out.clear();
+  Out.reserve(NumTerminals);
+  for (unsigned I = 0; I < NumTerminals; ++I) {
+    tok::TokenKind K = static_cast<tok::TokenKind>(I);
+    if (const auto *Punc = tok::getPunctuatorSpelling(K))
+      Out.push_back(Punc);
+    else
+      Out.push_back(llvm::StringRef(tok::getTokenName(K)).upper());
+  }
+}
+// Builds grammar from BNF files.
+class GrammarBuilder {
+public:
+  GrammarBuilder(std::vector<std::string> &Diagnostics)
+      : Diagnostics(Diagnostics) {}
+
+  // Builds the Grammar from BNF text; problems are reported to Diagnostics.
+  std::unique_ptr<Grammar> build(llvm::StringRef BNF) {
+    auto Specs = eliminateOptional(parse(BNF));
+    assert(llvm::all_of(Specs,
+                        [](const RuleSpec &R) {
+                          if (R.Target.endswith(OptSuffix))
+                            return false;
+                          return llvm::all_of(
+                              R.Sequence, [](const RuleSpec::Element &E) {
+                                return !E.Symbol.endswith(OptSuffix);
+                              });
+                        }) &&
+           "Optional symbols should be eliminated!");
+
+    auto T = std::make_unique<GrammarTable>();
+    initTerminals(T->Terminals);
+
+    // Assemble the name->ID and ID->nonterminal name maps.
+    llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
+    llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+    for (uint16_t I = 0; I < NumTerminals; ++I)
+      SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
+    auto Consider = [&](llvm::StringRef Name) {
+      if (!SymbolIds.count(Name))
+        UniqueNonterminals.insert(Name);
+    };
+    for (const auto &Spec : Specs) {
+      Consider(Spec.Target);
+      for (const RuleSpec::Element &Elt : Spec.Sequence)
+        Consider(Elt.Symbol);
+    }
+    llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
+      T->Nonterminals.emplace_back();
+      T->Nonterminals.back().Name = Name.str();
+    });
+    assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
+           "Too many nonterminals to fit in SymbolID bits!");
+    llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
+                                   const GrammarTable::Nonterminal &R) {
+      return L.Name < R.Name;
+    });
+    // Build name -> ID maps for nonterminals.
+    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+      SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
+
+    // Convert the rules.
+    T->Rules.reserve(Specs.size());
+    std::vector<SymbolID> Symbols;
+    auto Lookup = [&SymbolIds](llvm::StringRef Name) { // by ref: no map copy
+      auto It = SymbolIds.find(Name);
+      assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
+      return It->second;
+    };
+    for (const auto &Spec : Specs) {
+      assert(Spec.Sequence.size() <= Rule::MaxElements);
+      Symbols.clear();
+      for (const RuleSpec::Element &Elt : Spec.Sequence)
+        Symbols.push_back(Lookup(Elt.Symbol));
+      T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
+    }
+    assert(T->Rules.size() < (1 << RuleBits) &&
+           "Too many rules to fit in RuleID bits!");
+    llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
+      // Total order (target, size, symbols): identical rules become adjacent.
+      return std::make_tuple(Left.Target, Left.Size, Left.seq().vec()) <
+             std::make_tuple(Right.Target, Right.Size, Right.seq().vec());
+    });
+    RuleID RulePos = 0;
+    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
+      RuleID Start = RulePos;
+      while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID)
+        ++RulePos;
+      T->Nonterminals[SID].RuleRange = {Start, RulePos};
+    }
+    auto G = std::make_unique<Grammar>(std::move(T));
+    diagnoseGrammar(*G);
+    return G;
+  }
+
+private:
+  // Text representation of a BNF grammar rule.
+  struct RuleSpec {
+    llvm::StringRef Target;
+    struct Element {
+      llvm::StringRef Symbol; // Name of the symbol
+    };
+    std::vector<Element> Sequence;
+
+    std::string toString() const {
+      std::vector<llvm::StringRef> Body;
+      for (const auto &E : Sequence)
+        Body.push_back(E.Symbol);
+      return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
+    }
+  };
+
+  std::vector<RuleSpec> parse(llvm::StringRef Lines) {
+    std::vector<RuleSpec> Specs;
+    for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
+      // Strip anything coming after the '#' (comment) first, then the
+      // surrounding whitespace, so diagnostics never quote trailing blanks.
+      Line = Line.take_while([](char C) { return C != '#'; }).trim();
+      if (Line.empty())
+        continue;
+      RuleSpec Rule;
+      if (parseLine(Line, Rule))
+        Specs.push_back(std::move(Rule));
+    }
+    return Specs;
+  }
+
+  // Parses one "target := symbol..." line into Out; diagnoses a missing ":=".
+  bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
+    auto Parts = Line.split(":=");
+    if (Parts.first == Line) { // no separator in Line
+      Diagnostics.push_back(
+          llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
+      return false;
+    }
+
+    Out.Target = Parts.first.trim();
+    Out.Sequence.clear();
+    for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
+      Chunk = Chunk.trim();
+      if (Chunk.empty())
+        continue; // skip empty
+      Out.Sequence.push_back({Chunk});
+    }
+    return true;
+  }
+
+  // Inlines all _opt symbols.
+  // For example, a rule E := id +_opt id, after elimination, we have two
+  // equivalent rules:
+  //   1) E := id + id
+  //   2) E := id id
+  std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
+    std::vector<RuleSpec> Results;
+    std::vector<RuleSpec::Element> Storage;
+    for (const auto &R : Input) {
+      eliminateOptionalTail(
+          R.Sequence, Storage, [&Results, &Storage, &R, this]() {
+            if (Storage.empty()) {
+              Diagnostics.push_back(
+                  llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
+              return;
+            }
+            Results.push_back({R.Target, Storage});
+          });
+      assert(Storage.empty());
+    }
+    return Results;
+  }
+  void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
+                             std::vector<RuleSpec::Element> &Result,
+                             llvm::function_ref<void()> CB) {
+    if (Elements.empty())
+      return CB();
+    auto Front = Elements.front();
+    if (!Front.Symbol.endswith(OptSuffix)) {
+      Result.push_back(std::move(Front));
+      eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+      Result.pop_back();
+      return;
+    }
+    // Enumerate two options: skip the opt symbol, or inline the symbol.
+    eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
+    Front.Symbol = Front.Symbol.drop_back(OptSuffix.size());   // drop "_opt"
+    Result.push_back(std::move(Front));
+    eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+    Result.pop_back();
+  }
+
+  // Diagnoses the grammar and emits warnings if any.
+  void diagnoseGrammar(const Grammar &G) {
+    const auto &T = G.table();
+    for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
+      auto Range = T.Nonterminals[SID].RuleRange;
+      if (Range.start == Range.end)
+        Diagnostics.push_back(
+            llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
+      llvm::StringRef NameRef = T.Nonterminals[SID].Name;
+      if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
+        Diagnostics.push_back(llvm::formatv(
+            "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
+      }
+    }
+    for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
+      if (RID + 1 < T.Rules.size() && T.Rules[RID] == T.Rules[RID + 1])
+        Diagnostics.push_back(
+            llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
+      // Warning for nullable non-terminals (checked for every rule, last too).
+      if (T.Rules[RID].Size == 0)
+        Diagnostics.push_back(
+            llvm::formatv("Rule `{0}` has a nullable RHS", G.dumpRule(RID)));
+    }
+    // symbol-id -> used counts
+    std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
+    for (const Rule &R : T.Rules)
+      for (SymbolID SID : R.seq())
+        if (isNonterminal(SID))
+          ++UseCounts[SID];
+    for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
+      if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
+        Diagnostics.push_back(
+            llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
+  }
+  std::vector<std::string> &Diagnostics;
+};
+} // namespace
+
+std::unique_ptr<Grammar>
+Grammar::parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diagnostics) {
+  Diagnostics.clear();
+  return GrammarBuilder(Diagnostics).build(BNF);
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang

diff  --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt b/clang/unittests/Tooling/Syntax/CMakeLists.txt
index 174f3e7bf5732..a21d558c481cb 100644
--- a/clang/unittests/Tooling/Syntax/CMakeLists.txt
+++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -28,3 +28,5 @@ target_link_libraries(SyntaxTests
   PRIVATE
   LLVMTestingSupport
 )
+
+add_subdirectory(Pseudo)

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
new file mode 100644
index 0000000000000..77c6cbac026de
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+add_clang_unittest(ClangPseudoTests
+  GrammarTests.cpp
+)
+
+clang_target_link_libraries(ClangPseudoTests
+  PRIVATE
+  clangBasic
+  clangLex
+  clangSyntaxPseudo
+  clangTesting
+  )
+
+target_link_libraries(ClangPseudoTests
+  PRIVATE
+  LLVMTestingSupport
+  )

diff  --git a/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
new file mode 100644
index 0000000000000..7954e53d786c3
--- /dev/null
+++ b/clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
@@ -0,0 +1,102 @@
+//===--- GrammarTests.cpp - grammar tests  ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::IsEmpty;
+using testing::UnorderedElementsAre;
+
+MATCHER_P(TargetID, SID, "") { return arg.Target == SID; }
+template <typename... T> testing::Matcher<const Rule &> Sequence(T... IDs) {
+  return testing::Property(&Rule::seq, ElementsAre(IDs...));
+}
+
+class GrammarTest : public ::testing::Test {
+public:
+  void build(llvm::StringRef BNF) {
+    Diags.clear();
+    G = Grammar::parseBNF(BNF, Diags);
+  }
+
+  SymbolID lookup(llvm::StringRef Name) const {
+    for (unsigned I = 0; I < NumTerminals; ++I)
+      if (G->table().Terminals[I] == Name)
+        return tokenSymbol(static_cast<tok::TokenKind>(I));
+    for (SymbolID ID = 0; ID < G->table().Nonterminals.size(); ++ID)
+      if (G->table().Nonterminals[ID].Name == Name)
+        return ID;
+    ADD_FAILURE() << "No such symbol found: " << Name;
+    return 0;
+  }
+
+protected:
+  std::unique_ptr<Grammar> G;
+  std::vector<std::string> Diags;
+};
+
+TEST_F(GrammarTest, Basic) {
+  build("expression := IDENTIFIER + expression # comment");
+  EXPECT_THAT(Diags, IsEmpty());
+
+  auto ExpectedRule =
+      AllOf(TargetID(lookup("expression")),
+            Sequence(lookup("IDENTIFIER"), lookup("+"), lookup("expression")));
+  auto ExpressionID = lookup("expression");
+  EXPECT_EQ(G->symbolName(ExpressionID), "expression");
+  EXPECT_THAT(G->rulesFor(ExpressionID), UnorderedElementsAre(ExpectedRule));
+  const auto &Rule = G->lookupRule(/*RID=*/0);
+  EXPECT_THAT(Rule, ExpectedRule);
+  EXPECT_THAT(G->symbolName(Rule.seq()[0]), "IDENTIFIER");
+  EXPECT_THAT(G->symbolName(Rule.seq()[1]), "+");
+  EXPECT_THAT(G->symbolName(Rule.seq()[2]), "expression");
+}
+
+TEST_F(GrammarTest, EliminatedOptional) {
+  build("_ := CONST_opt INT ;_opt");
+  EXPECT_THAT(Diags, IsEmpty());
+  EXPECT_THAT(G->table().Rules,
+              UnorderedElementsAre(
+                  Sequence(lookup("INT")),
+                  Sequence(lookup("CONST"), lookup("INT")),
+                  Sequence(lookup("CONST"), lookup("INT"), lookup(";")),
+                  Sequence(lookup("INT"), lookup(";"))));
+}
+
+TEST_F(GrammarTest, Diagnostics) {
+  build(R"cpp(
+    _ := ,_opt
+    _ := undefined-sym
+    null :=
+    _ := IDENFIFIE # a typo of the terminal IDENFITIER
+
+    invalid
+  )cpp");
+
+  EXPECT_THAT(Diags, UnorderedElementsAre(
+                         "Rule '_ := ,_opt' has a nullable RHS",
+                         "Rule 'null := ' has a nullable RHS",
+                         "No rules for nonterminal: undefined-sym",
+                         "Failed to parse 'invalid': no separator :=",
+                         "Token-like name IDENFIFIE is used as a nonterminal",
+                         "No rules for nonterminal: IDENFIFIE"));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang