[clang-tools-extra] 3132e9c - [pseudo] Key guards by RuleID, add guards to literals (and 0).

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Thu Jul 21 13:42:50 PDT 2022


Author: Sam McCall
Date: 2022-07-21T22:42:31+02:00
New Revision: 3132e9cd7c9fda63f7c0babf8bd5f6d755ce9027

URL: https://github.com/llvm/llvm-project/commit/3132e9cd7c9fda63f7c0babf8bd5f6d755ce9027
DIFF: https://github.com/llvm/llvm-project/commit/3132e9cd7c9fda63f7c0babf8bd5f6d755ce9027.diff

LOG: [pseudo] Key guards by RuleID, add guards to literals (and 0).

After this, NUMERIC_CONSTANT and strings should parse only one way.

There are 8 types of literals, and 24 valid (literal, TokenKind) pairs.
This means adding 8 new named guards (or 24, if we want to assert the token).

It seems fairly clear to me at this point that the guard names are unnecessary
indirection: the guards are in fact coupled to the rule signature.

(Also add the zero guard I forgot in the previous patch.)

Differential Revision: https://reviews.llvm.org/D130066

Added: 
    clang-tools-extra/pseudo/test/cxx/literals.cpp

Modified: 
    clang-tools-extra/pseudo/include/clang-pseudo/Language.h
    clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
    clang-tools-extra/pseudo/lib/GLR.cpp
    clang-tools-extra/pseudo/lib/cxx/CXX.cpp
    clang-tools-extra/pseudo/lib/cxx/cxx.bnf
    clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
    clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
    clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
    clang-tools-extra/pseudo/tool/ClangPseudo.cpp
    clang-tools-extra/pseudo/unittests/GLRTest.cpp
    clang-tools-extra/pseudo/unittests/GrammarTest.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h
index 410ca075a5da4..3696543915cba 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Language.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Language.h
@@ -46,7 +46,7 @@ struct Language {
   LRTable Table;
 
   // Binding extension ids to corresponding implementations.
-  llvm::DenseMap<ExtensionID, RuleGuard> Guards;
+  llvm::DenseMap<RuleID, RuleGuard> Guards;
   llvm::DenseMap<ExtensionID, RecoveryStrategy> RecoveryStrategies;
 
   // FIXME: add clang::LangOptions.

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
index ef22f71d801c0..a3d85aacef23e 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
@@ -28,12 +28,12 @@
 //  ), and an extension point corresponds to a piece of native code. For
 //  example, C++ grammar has a rule:
 //
-//    contextual-override := IDENTIFIER [guard=Override]
+//   compound_statement := { statement-seq [recover=Brackets] }
 //
-//  GLR parser only conducts the reduction of the rule if the IDENTIFIER
-//  content is `override`. This Override guard is implemented in CXX.cpp by
-//  binding the ExtensionID for the `Override` value to a specific C++ function
-//  that performs the check.
+//  The `recover` attribute instructs the parser that we should perform error
+//  recovery if parsing the statement-seq fails. The `Brackets` recovery
+//  heuristic is implemented in CXX.cpp by binding the ExtensionID for the
+//  `Brackets` value to a specific C++ function that finds the recovery point.
 //
 //  Notions about the BNF grammar:
 //  - "_" is the start symbol of the augmented grammar;
@@ -118,11 +118,8 @@ struct Rule {
   uint8_t Size : SizeBits; // Size of the Sequence
   SymbolID Sequence[MaxElements];
 
-  // A guard extension controls whether a reduction of a rule will be conducted
-  // by the GLR parser.
-  // 0 is sentinel unset extension ID, indicating there is no guard extension
-  // being set for this rule.
-  ExtensionID Guard = 0;
+  // A guarded rule has extra logic to determine whether the RHS is eligible.
+  bool Guarded = false;
 
   // Specifies the index within Sequence eligible for error recovery.
   // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
@@ -136,7 +133,7 @@ struct Rule {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }
   friend bool operator==(const Rule &L, const Rule &R) {
-    return L.Target == R.Target && L.seq() == R.seq() && L.Guard == R.Guard;
+    return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
   }
 };
 

diff  --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index 1664089725197..df8381d04326b 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -416,11 +416,11 @@ class GLRReduce {
   }
 
 private:
-  bool canReduce(ExtensionID GuardID, RuleID RID,
+  bool canReduce(const Rule &R, RuleID RID,
                  llvm::ArrayRef<const ForestNode *> RHS) const {
-    if (!GuardID)
+    if (!R.Guarded)
       return true;
-    if (auto Guard = Lang.Guards.lookup(GuardID))
+    if (auto Guard = Lang.Guards.lookup(RID))
       return Guard(RHS, Params.Code);
     LLVM_DEBUG(llvm::dbgs()
                << llvm::formatv("missing guard implementation for rule {0}\n",
@@ -441,7 +441,7 @@ class GLRReduce {
           for (const auto *B : N->parents())
             llvm::dbgs() << "    --> base at S" << B->State << "\n";
         });
-        if (!canReduce(Rule.Guard, RID, TempSequence))
+        if (!canReduce(Rule, RID, TempSequence))
           return;
         // Copy the chain to stable storage so it can be enqueued.
         if (SequenceStorageCount == SequenceStorage.size())
@@ -572,7 +572,7 @@ class GLRReduce {
       TempSequence[Rule.Size - 1 - I] = Base->Payload;
       Base = Base->parents().front();
     }
-    if (!canReduce(Rule.Guard, *RID, TempSequence))
+    if (!canReduce(Rule, *RID, TempSequence))
       return true; // reduction is not available
     const ForestNode *Parsed =
         &Params.Forest.createSequence(Rule.Target, *RID, TempSequence);

diff  --git a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp
index 4b78a67e0be0f..8fa24bfbbd0b5 100644
--- a/clang-tools-extra/pseudo/lib/cxx/CXX.cpp
+++ b/clang-tools-extra/pseudo/lib/cxx/CXX.cpp
@@ -11,6 +11,9 @@
 #include "clang-pseudo/Language.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRTable.h"
+#include "clang/Basic/CharInfo.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/StringSwitch.h"
 #include <utility>
 
 namespace clang {
@@ -21,29 +24,88 @@ static const char *CXXBNF =
 #include "CXXBNF.inc"
     ;
 
-bool guardOverride(llvm::ArrayRef<const ForestNode *> RHS,
-                   const TokenStream &Tokens) {
-  assert(RHS.size() == 1 &&
-         RHS.front()->symbol() == tokenSymbol(clang::tok::identifier));
-  return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "override";
+// User-defined string literals look like `""suffix`.
+bool isStringUserDefined(const Token &Tok) {
+  return !Tok.text().endswith("\"");
 }
-bool guardFinal(llvm::ArrayRef<const ForestNode *> RHS,
-                const TokenStream &Tokens) {
-  assert(RHS.size() == 1 &&
-         RHS.front()->symbol() == tokenSymbol(clang::tok::identifier));
-  return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "final";
-}
-bool guardModule(llvm::ArrayRef<const ForestNode *> RHS,
-                 const TokenStream &Tokens) {
-  return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "module";
+bool isCharUserDefined(const Token &Tok) { return !Tok.text().endswith("'"); }
+
+// Combinable flags describing numbers.
+// Clang has just one numeric_constant token kind, the grammar has 4.
+enum NumericKind {
+  Integer = 0,
+  Floating = 1 << 0,
+  UserDefined = 1 << 1,
+};
+// Determine the kind of numeric_constant we have.
+// We can assume it's something valid, as it has been lexed.
+// FIXME: is this expensive enough that we should set flags on the token
+// and reuse them rather than computing it for each guard?
+unsigned numKind(const Token &Tok) {
+  assert(Tok.Kind == tok::numeric_constant);
+  llvm::StringRef Text = Tok.text();
+  if (Text.size() <= 1)
+    return Integer;
+  bool Hex =
+      Text.size() > 2 && Text[0] == '0' && (Text[1] == 'x' || Text[1] == 'X');
+  uint8_t K = Integer;
+
+  for (char C : Text) {
+    switch (C) {
+    case '.':
+      K |= Floating;
+      break;
+    case 'e':
+    case 'E':
+      if (!Hex)
+        K |= Floating;
+      break;
+    case 'p':
+    case 'P':
+      if (Hex)
+        K |= Floating;
+      break;
+    case '_':
+      K |= UserDefined;
+      break;
+    default:
+      break;
+    }
+  }
+
+  // We would be done here, but there are stdlib UDLs that lack _.
+  // We must distinguish these from the builtin suffixes.
+  unsigned LastLetter = Text.size();
+  while (LastLetter > 0 && isLetter(Text[LastLetter - 1]))
+    --LastLetter;
+  if (LastLetter == Text.size()) // Common case
+    return NumericKind(K);
+  // Trailing d/e/f are not part of the suffix in hex numbers.
+  while (Hex && LastLetter < Text.size() && isHexDigit(Text[LastLetter]))
+    ++LastLetter;
+  return llvm::StringSwitch<int, unsigned>(Text.substr(LastLetter))
+      // std::chrono
+      .Cases("h", "min", "s", "ms", "us", "ns", "d", "y", K | UserDefined)
+      // complex
+      .Cases("il", "i", "if", K | UserDefined)
+      .Default(K);
 }
-bool guardImport(llvm::ArrayRef<const ForestNode *> RHS,
-                 const TokenStream &Tokens) {
-  return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "import";
+
+// RHS is expected to contain a single terminal.
+// Returns the corresponding token.
+const Token &onlyToken(tok::TokenKind Kind,
+                       const ArrayRef<const ForestNode *> RHS,
+                       const TokenStream &Tokens) {
+  assert(RHS.size() == 1 && RHS.front()->symbol() == tokenSymbol(Kind));
+  return Tokens.tokens()[RHS.front()->startTokenIndex()];
 }
-bool guardExport(llvm::ArrayRef<const ForestNode *> RHS,
-                 const TokenStream &Tokens) {
-  return Tokens.tokens()[RHS.front()->startTokenIndex()].text() == "export";
+// RHS is expected to contain a single symbol.
+// Returns the corresponding ForestNode.
+const ForestNode &onlySymbol(SymbolID Kind,
+                             const ArrayRef<const ForestNode *> RHS,
+                             const TokenStream &Tokens) {
+  assert(RHS.size() == 1 && RHS.front()->symbol() == Kind);
+  return *RHS.front();
 }
 
 bool isFunctionDeclarator(const ForestNode *Declarator) {
@@ -93,29 +155,92 @@ bool isFunctionDeclarator(const ForestNode *Declarator) {
   }
   llvm_unreachable("unreachable");
 }
-bool guardFunction(llvm::ArrayRef<const ForestNode *> RHS,
-                   const TokenStream &Tokens) {
-  assert(RHS.size() == 1 &&
-         RHS.front()->symbol() == (SymbolID)(cxx::Symbol::declarator));
-  return isFunctionDeclarator(RHS.front());
-}
-bool guardNonFunction(llvm::ArrayRef<const ForestNode *> RHS,
-                      const TokenStream &Tokens) {
-  assert(RHS.size() == 1 &&
-         RHS.front()->symbol() == (SymbolID)(cxx::Symbol::declarator));
-  return !isFunctionDeclarator(RHS.front());
-}
 
 llvm::DenseMap<ExtensionID, RuleGuard> buildGuards() {
+#define TOKEN_GUARD(kind, cond)                                                \
+  [](llvm::ArrayRef<const ForestNode *> RHS, const TokenStream &Tokens) {      \
+    const Token &Tok = onlyToken(tok::kind, RHS, Tokens);                      \
+    return cond;                                                               \
+  }
+#define SYMBOL_GUARD(kind, cond)                                               \
+  [](llvm::ArrayRef<const ForestNode *> RHS, const TokenStream &Tokens) {      \
+    const ForestNode &N = onlySymbol((SymbolID)Symbol::kind, RHS, Tokens);     \
+    return cond;                                                               \
+  }
   return {
-      {(ExtensionID)Extension::Override, guardOverride},
-      {(ExtensionID)Extension::Final, guardFinal},
-      {(ExtensionID)Extension::Import, guardImport},
-      {(ExtensionID)Extension::Export, guardExport},
-      {(ExtensionID)Extension::Module, guardModule},
-      {(ExtensionID)Extension::FunctionDeclarator, guardFunction},
-      {(ExtensionID)Extension::NonFunctionDeclarator, guardNonFunction},
+      {(RuleID)Rule::function_declarator_0declarator,
+       SYMBOL_GUARD(declarator, isFunctionDeclarator(&N))},
+      {(RuleID)Rule::non_function_declarator_0declarator,
+       SYMBOL_GUARD(declarator, !isFunctionDeclarator(&N))},
+
+      {(RuleID)Rule::contextual_override_0identifier,
+       TOKEN_GUARD(identifier, Tok.text() == "override")},
+      {(RuleID)Rule::contextual_final_0identifier,
+       TOKEN_GUARD(identifier, Tok.text() == "final")},
+      {(RuleID)Rule::import_keyword_0identifier,
+       TOKEN_GUARD(identifier, Tok.text() == "import")},
+      {(RuleID)Rule::export_keyword_0identifier,
+       TOKEN_GUARD(identifier, Tok.text() == "export")},
+      {(RuleID)Rule::module_keyword_0identifier,
+       TOKEN_GUARD(identifier, Tok.text() == "module")},
+      {(RuleID)Rule::contextual_zero_0numeric_constant,
+       TOKEN_GUARD(numeric_constant, Tok.text() == "0")},
+
+      // The grammar distinguishes (only) user-defined vs plain string literals,
+      // where the clang lexer distinguishes (only) encoding types.
+      {(RuleID)Rule::user_defined_string_literal_chunk_0string_literal,
+       TOKEN_GUARD(string_literal, isStringUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_string_literal_chunk_0utf8_string_literal,
+       TOKEN_GUARD(utf8_string_literal, isStringUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_string_literal_chunk_0utf16_string_literal,
+       TOKEN_GUARD(utf16_string_literal, isStringUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_string_literal_chunk_0utf32_string_literal,
+       TOKEN_GUARD(utf32_string_literal, isStringUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_string_literal_chunk_0wide_string_literal,
+       TOKEN_GUARD(wide_string_literal, isStringUserDefined(Tok))},
+      {(RuleID)Rule::string_literal_chunk_0string_literal,
+       TOKEN_GUARD(string_literal, !isStringUserDefined(Tok))},
+      {(RuleID)Rule::string_literal_chunk_0utf8_string_literal,
+       TOKEN_GUARD(utf8_string_literal, !isStringUserDefined(Tok))},
+      {(RuleID)Rule::string_literal_chunk_0utf16_string_literal,
+       TOKEN_GUARD(utf16_string_literal, !isStringUserDefined(Tok))},
+      {(RuleID)Rule::string_literal_chunk_0utf32_string_literal,
+       TOKEN_GUARD(utf32_string_literal, !isStringUserDefined(Tok))},
+      {(RuleID)Rule::string_literal_chunk_0wide_string_literal,
+       TOKEN_GUARD(wide_string_literal, !isStringUserDefined(Tok))},
+      // And the same for chars.
+      {(RuleID)Rule::user_defined_character_literal_0char_constant,
+       TOKEN_GUARD(char_constant, isCharUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_character_literal_0utf8_char_constant,
+       TOKEN_GUARD(utf8_char_constant, isCharUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_character_literal_0utf16_char_constant,
+       TOKEN_GUARD(utf16_char_constant, isCharUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_character_literal_0utf32_char_constant,
+       TOKEN_GUARD(utf32_char_constant, isCharUserDefined(Tok))},
+      {(RuleID)Rule::user_defined_character_literal_0wide_char_constant,
+       TOKEN_GUARD(wide_char_constant, isCharUserDefined(Tok))},
+      {(RuleID)Rule::character_literal_0char_constant,
+       TOKEN_GUARD(char_constant, !isCharUserDefined(Tok))},
+      {(RuleID)Rule::character_literal_0utf8_char_constant,
+       TOKEN_GUARD(utf8_char_constant, !isCharUserDefined(Tok))},
+      {(RuleID)Rule::character_literal_0utf16_char_constant,
+       TOKEN_GUARD(utf16_char_constant, !isCharUserDefined(Tok))},
+      {(RuleID)Rule::character_literal_0utf32_char_constant,
+       TOKEN_GUARD(utf32_char_constant, !isCharUserDefined(Tok))},
+      {(RuleID)Rule::character_literal_0wide_char_constant,
+       TOKEN_GUARD(wide_char_constant, !isCharUserDefined(Tok))},
+      // clang just has one NUMERIC_CONSTANT token for {ud,plain}x{float,int}
+      {(RuleID)Rule::user_defined_integer_literal_0numeric_constant,
+       TOKEN_GUARD(numeric_constant, numKind(Tok) == (Integer | UserDefined))},
+      {(RuleID)Rule::user_defined_floating_point_literal_0numeric_constant,
+       TOKEN_GUARD(numeric_constant, numKind(Tok) == (Floating | UserDefined))},
+      {(RuleID)Rule::integer_literal_0numeric_constant,
+       TOKEN_GUARD(numeric_constant, numKind(Tok) == Integer)},
+      {(RuleID)Rule::floating_point_literal_0numeric_constant,
+       TOKEN_GUARD(numeric_constant, numKind(Tok) == Floating)},
   };
+#undef TOKEN_GUARD
+#undef SYMBOL_GUARD
 }
 
 Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) {

diff  --git a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf
index 4e434b1e037cc..d49fb8fb7cf42 100644
--- a/clang-tools-extra/pseudo/lib/cxx/cxx.bnf
+++ b/clang-tools-extra/pseudo/lib/cxx/cxx.bnf
@@ -413,8 +413,8 @@ init-declarator-list := init-declarator-list , init-declarator
 #! to eliminate these false parses.
 init-declarator := non-function-declarator initializer_opt
 init-declarator := function-declarator requires-clause_opt
-function-declarator := declarator [guard=FunctionDeclarator]
-non-function-declarator := declarator [guard=NonFunctionDeclarator]
+function-declarator := declarator [guard]
+non-function-declarator := declarator [guard]
 declarator := ptr-declarator
 declarator := noptr-declarator parameters-and-qualifiers trailing-return-type
 ptr-declarator := noptr-declarator
@@ -715,18 +715,18 @@ literal := string-literal
 literal := boolean-literal
 literal := pointer-literal
 literal := user-defined-literal
-integer-literal := NUMERIC_CONSTANT
-character-literal := CHAR_CONSTANT
-character-literal := WIDE_CHAR_CONSTANT
-character-literal := UTF8_CHAR_CONSTANT
-character-literal := UTF16_CHAR_CONSTANT
-character-literal := UTF32_CHAR_CONSTANT
-floating-point-literal := NUMERIC_CONSTANT
-string-literal-chunk := STRING_LITERAL
-string-literal-chunk := WIDE_STRING_LITERAL
-string-literal-chunk := UTF8_STRING_LITERAL
-string-literal-chunk := UTF16_STRING_LITERAL
-string-literal-chunk := UTF32_STRING_LITERAL
+integer-literal := NUMERIC_CONSTANT [guard]
+character-literal := CHAR_CONSTANT [guard]
+character-literal := WIDE_CHAR_CONSTANT [guard]
+character-literal := UTF8_CHAR_CONSTANT [guard]
+character-literal := UTF16_CHAR_CONSTANT [guard]
+character-literal := UTF32_CHAR_CONSTANT [guard]
+floating-point-literal := NUMERIC_CONSTANT [guard]
+string-literal-chunk := STRING_LITERAL [guard]
+string-literal-chunk := WIDE_STRING_LITERAL [guard]
+string-literal-chunk := UTF8_STRING_LITERAL [guard]
+string-literal-chunk := UTF16_STRING_LITERAL [guard]
+string-literal-chunk := UTF32_STRING_LITERAL [guard]
 #! Technically, string concatenation happens at phase 6 which is before parsing,
 #! so it doesn't belong to the grammar. However, we extend the grammar to
 #! support it, to make the pseudoparser fully functional on practical code.
@@ -736,33 +736,33 @@ user-defined-literal := user-defined-integer-literal
 user-defined-literal := user-defined-floating-point-literal
 user-defined-literal := user-defined-string-literal
 user-defined-literal := user-defined-character-literal
-user-defined-integer-literal := NUMERIC_CONSTANT
-user-defined-string-literal-chunk := STRING_LITERAL
-user-defined-string-literal-chunk := WIDE_STRING_LITERAL
-user-defined-string-literal-chunk := UTF8_STRING_LITERAL
-user-defined-string-literal-chunk := UTF16_STRING_LITERAL
-user-defined-string-literal-chunk := UTF32_STRING_LITERAL
+user-defined-integer-literal := NUMERIC_CONSTANT [guard]
+user-defined-string-literal-chunk := STRING_LITERAL [guard]
+user-defined-string-literal-chunk := WIDE_STRING_LITERAL [guard]
+user-defined-string-literal-chunk := UTF8_STRING_LITERAL [guard]
+user-defined-string-literal-chunk := UTF16_STRING_LITERAL [guard]
+user-defined-string-literal-chunk := UTF32_STRING_LITERAL [guard]
 user-defined-string-literal := user-defined-string-literal-chunk
 user-defined-string-literal := string-literal-chunk user-defined-string-literal
 user-defined-string-literal := user-defined-string-literal string-literal-chunk
-user-defined-floating-point-literal := NUMERIC_CONSTANT
-user-defined-character-literal := CHAR_CONSTANT
-user-defined-character-literal := WIDE_CHAR_CONSTANT
-user-defined-character-literal := UTF8_CHAR_CONSTANT
-user-defined-character-literal := UTF16_CHAR_CONSTANT
-user-defined-character-literal := UTF32_CHAR_CONSTANT
+user-defined-floating-point-literal := NUMERIC_CONSTANT [guard]
+user-defined-character-literal := CHAR_CONSTANT [guard]
+user-defined-character-literal := WIDE_CHAR_CONSTANT [guard]
+user-defined-character-literal := UTF8_CHAR_CONSTANT [guard]
+user-defined-character-literal := UTF16_CHAR_CONSTANT [guard]
+user-defined-character-literal := UTF32_CHAR_CONSTANT [guard]
 boolean-literal := FALSE
 boolean-literal := TRUE
 pointer-literal := NULLPTR
 
 #! Contextual keywords -- clang lexer always lexes them as identifier tokens.
 #! Placeholders for literal text in the grammar that lex as other things.
-contextual-override := IDENTIFIER [guard=Override]
-contextual-final := IDENTIFIER [guard=Final]
-contextual-zero := NUMERIC_CONSTANT [guard=Zero]
-module-keyword := IDENTIFIER [guard=Module]
-import-keyword := IDENTIFIER [guard=Import]
-export-keyword := IDENTIFIER [guard=Export]
+contextual-override := IDENTIFIER [guard]
+contextual-final := IDENTIFIER [guard]
+contextual-zero := NUMERIC_CONSTANT [guard]
+module-keyword := IDENTIFIER [guard]
+import-keyword := IDENTIFIER [guard]
+export-keyword := IDENTIFIER [guard]
 
 #! greatergreater token -- clang lexer always lexes it as a single token, we
 #! split it into two tokens to make the GLR parser aware of the nested-template

diff  --git a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
index da4e2dfd7a542..7fd8c3b66b8cd 100644
--- a/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -86,8 +86,8 @@ std::string Grammar::dumpRule(RuleID RID) const {
     if (R.RecoveryIndex == I)
       OS << " [recover=" << T->AttributeValues[R.Recovery] << "]";
   }
-  if (R.Guard)
-    OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
+  if (R.Guarded)
+    OS << " [guard]";
   return Result;
 }
 

diff  --git a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
index 5e1fe9b6a0086..43fba22dd52d7 100644
--- a/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ b/clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -76,6 +76,7 @@ class GrammarBuilder {
     });
     // Add an empty string for the corresponding sentinel unset attribute.
     T->AttributeValues.push_back("");
+    UniqueAttributeValues.erase("");
     llvm::for_each(UniqueAttributeValues, [&T](llvm::StringRef Name) {
       T->AttributeValues.emplace_back();
       T->AttributeValues.back() = Name.str();
@@ -258,7 +259,7 @@ class GrammarBuilder {
     for (unsigned I = 0; I < Spec.Sequence.size(); ++I) {
       for (const auto &KV : Spec.Sequence[I].Attributes) {
         if (KV.first == "guard") {
-          R.Guard = LookupExtensionID(KV.second);
+          R.Guarded = true;
         } else if (KV.first == "recover") {
           R.Recovery = LookupExtensionID(KV.second);
           R.RecoveryIndex = I;

diff  --git a/clang-tools-extra/pseudo/test/cxx/literals.cpp b/clang-tools-extra/pseudo/test/cxx/literals.cpp
new file mode 100644
index 0000000000000..e1cec8985b25f
--- /dev/null
+++ b/clang-tools-extra/pseudo/test/cxx/literals.cpp
@@ -0,0 +1,43 @@
+// RUN: clang-pseudo -grammar=cxx -source=%s --print-forest -forest-abbrev=0 | FileCheck %s --implicit-check-not=ambiguous
+auto list = {
+  0,      // CHECK: := integer-literal
+  0b1011, // CHECK: := integer-literal
+  0777,   // CHECK: := integer-literal
+  42_u,   // CHECK: := user-defined-integer-literal
+  0LL,    // CHECK: := integer-literal
+  0h,     // CHECK: := user-defined-integer-literal
+  0.,     // CHECK: := floating-point-literal
+  .2,     // CHECK: := floating-point-literal
+  2e1,    // CHECK: := floating-point-literal
+  0x42d,  // CHECK: := integer-literal
+  0x42_d, // CHECK: := user-defined-integer-literal
+  0x42ds, // CHECK: := user-defined-integer-literal
+  0x1.2p2,// CHECK: := floating-point-literal
+  
+  "",               // CHECK: literal := string-literal
+  L"",              // CHECK: literal := string-literal
+  u8"",             // CHECK: literal := string-literal
+  u"",              // CHECK: literal := string-literal
+  U"",              // CHECK: literal := string-literal
+  R"()",            // CHECK: literal := string-literal
+  uR"()",           // CHECK: literal := string-literal
+  "a" "b",          // CHECK: literal := string-literal
+  u8"a" "b",        // CHECK: literal := string-literal
+  u"a" u"b",        // CHECK: literal := string-literal
+  "a"_u "b",        // CHECK: user-defined-literal := user-defined-string-literal
+  "a"_u u"b",       // CHECK: user-defined-literal := user-defined-string-literal
+  R"(a)" "\n",      // CHECK: literal := string-literal
+  R"c(a)c"_u u"\n", // CHECK: user-defined-literal := user-defined-string-literal
+
+  'a',      // CHECK: := character-literal
+  'abc',    // CHECK: := character-literal
+  'abcdef', // CHECK: := character-literal
+  u'a',     // CHECK: := character-literal
+  U'a',     // CHECK: := character-literal
+  L'a',     // CHECK: := character-literal
+  L'abc',   // CHECK: := character-literal
+  U'\u1234',// CHECK: := character-literal
+  '\u1234', // CHECK: := character-literal
+  u'a'_u,   // CHECK: := user-defined-character-literal
+};
+

diff  --git a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp b/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
index 5a89f4d57a528..d605a3d66a5de 100644
--- a/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
+++ b/clang-tools-extra/pseudo/test/cxx/mixed-designator.cpp
@@ -5,16 +5,16 @@ auto x = { 1, .f = 2, [c]{3} };
 // CHECK-NEXT: ├─{ := tok[3]
 // CHECK-NEXT: ├─initializer-list
 // CHECK-NEXT: │ ├─initializer-list
-// CHECK-NEXT: │ │ ├─initializer-list~literal
-// CHECK:      │ │ ├─, := tok[5]
+// CHECK-NEXT: │ │ ├─initializer-list~NUMERIC_CONSTANT
+// CHECK-NEXT: │ │ ├─, := tok[5]
 // CHECK-NEXT: │ │ └─initializer-list-item
 // CHECK-NEXT: │ │   ├─designator
 // CHECK-NEXT: │ │   │ ├─. := tok[6]
 // CHECK-NEXT: │ │   │ └─IDENTIFIER := tok[7]
 // CHECK-NEXT: │ │   └─brace-or-equal-initializer
 // CHECK-NEXT: │ │     ├─= := tok[8]
-// CHECK-NEXT: │ │     └─initializer-clause~literal
-// CHECK:      │ ├─, := tok[10]
+// CHECK-NEXT: │ │     └─initializer-clause~NUMERIC_CONSTANT
+// CHECK-NEXT: │ ├─, := tok[10]
 // CHECK-NEXT: │ └─initializer-list-item
 // CHECK-NEXT: │   ├─designator
 // CHECK-NEXT: │   │ ├─[ := tok[11]
@@ -22,6 +22,6 @@ auto x = { 1, .f = 2, [c]{3} };
 // CHECK-NEXT: │   │ └─] := tok[13]
 // CHECK-NEXT: │   └─brace-or-equal-initializer~braced-init-list
 // CHECK-NEXT: │     ├─{ := tok[14]
-// CHECK-NEXT: │     ├─initializer-list~literal
+// CHECK-NEXT: │     ├─initializer-list~NUMERIC_CONSTANT
 // CHECK:      │     └─} := tok[16]
 // CHECK-NEXT: └─} := tok[17]

diff  --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 9a11db9006d1e..f4f511ff14c6d 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -45,6 +45,8 @@ static opt<bool>
                     desc("Strip directives and select conditional sections"));
 static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
 static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
+static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
+                              init(true));
 static opt<std::string> HTMLForest("html-forest",
                                    desc("output file for HTML forest"));
 static opt<std::string> StartSymbol("start-symbol",
@@ -153,7 +155,7 @@ int main(int argc, char *argv[]) {
         glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
                  *StartSymID, Lang);
     if (PrintForest)
-      llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/true);
+      llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
 
     if (HTMLForest.getNumOccurrences()) {
       std::error_code EC;

diff  --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
index 05419c1a35273..2c3ef265de392 100644
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -631,10 +631,10 @@ TEST_F(GLRTest, GuardExtension) {
   build(R"bnf(
     _ := start
 
-    start := IDENTIFIER [guard=TestOnly]
+    start := IDENTIFIER [guard]
   )bnf");
   TestLang.Guards.try_emplace(
-      extensionID("TestOnly"),
+      ruleFor("start"),
       [&](llvm::ArrayRef<const ForestNode *> RHS, const TokenStream &Tokens) {
         assert(RHS.size() == 1 &&
                RHS.front()->symbol() == tokenSymbol(clang::tok::identifier));
@@ -647,7 +647,7 @@ TEST_F(GLRTest, GuardExtension) {
   const TokenStream &Succeeded = cook(lex(Input, LOptions), LOptions);
   EXPECT_EQ(glrParse({Succeeded, Arena, GSStack}, id("start"), TestLang)
                 .dumpRecursive(TestLang.G),
-            "[  0, end) start := IDENTIFIER [guard=TestOnly]\n"
+            "[  0, end) start := IDENTIFIER [guard]\n"
             "[  0, end) └─IDENTIFIER := tok[0]\n");
 
   Input = "notest";

diff  --git a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
index ee9a3ae2904b2..2657531ca7270 100644
--- a/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -102,16 +102,11 @@ TEST_F(GrammarTest, RuleIDSorted) {
 TEST_F(GrammarTest, Annotation) {
   build(R"bnf(
     _ := x
-
-    x := y [guard=value]
-    y := IDENTIFIER [guard=final]
-
+    x := IDENTIFIER [guard]
   )bnf");
-  ASSERT_TRUE(Diags.empty());
-  EXPECT_EQ(G.lookupRule(ruleFor("_")).Guard, 0);
-  EXPECT_GT(G.lookupRule(ruleFor("x")).Guard, 0);
-  EXPECT_GT(G.lookupRule(ruleFor("y")).Guard, 0);
-  EXPECT_NE(G.lookupRule(ruleFor("x")).Guard, G.lookupRule(ruleFor("y")).Guard);
+  ASSERT_THAT(Diags, IsEmpty());
+  EXPECT_FALSE(G.lookupRule(ruleFor("_")).Guarded);
+  EXPECT_TRUE(G.lookupRule(ruleFor("x")).Guarded);
 }
 
 TEST_F(GrammarTest, MangleName) {


        


More information about the cfe-commits mailing list