[clang-tools-extra] 56c54cf - [pseudo] Placeholder disambiguation strategy: always choose second
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Fri Aug 26 04:18:36 PDT 2022
Author: Sam McCall
Date: 2022-08-26T13:16:09+02:00
New Revision: 56c54cf66bcd6c2266cd96ab4da45c766fbad540
URL: https://github.com/llvm/llvm-project/commit/56c54cf66bcd6c2266cd96ab4da45c766fbad540
DIFF: https://github.com/llvm/llvm-project/commit/56c54cf66bcd6c2266cd96ab4da45c766fbad540.diff
LOG: [pseudo] Placeholder disambiguation strategy: always choose second
Mostly mechanics here. Interesting decisions:
- apply disambiguation in-place instead of copying the forest
debatable, but even the final tree size is significant
- split decide/apply into different functions - this allows the hard part
(decide) to be tested non-destructively and combined with HTML forest easily
- add non-const accessors to forest to enable apply
- unit tests but no lit tests: my plan is to test actual C++ disambiguation
heuristics with lit, generic disambiguation mechanics without the C++ grammar
Differential Revision: https://reviews.llvm.org/D132487
Added:
clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
clang-tools-extra/pseudo/lib/Disambiguate.cpp
clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
Modified:
clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
clang-tools-extra/pseudo/lib/CMakeLists.txt
clang-tools-extra/pseudo/lib/GLR.cpp
clang-tools-extra/pseudo/tool/ClangPseudo.cpp
clang-tools-extra/pseudo/tool/HTMLForest.cpp
clang-tools-extra/pseudo/unittests/CMakeLists.txt
Removed:
################################################################################
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h b/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
new file mode 100644
index 0000000000000..5f3a22c9cabb3
--- /dev/null
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Disambiguate.h
@@ -0,0 +1,64 @@
+//===--- Disambiguate.h - Find the best tree in the forest -------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A GLR parse forest represents every possible parse tree for the source code.
+//
+// Before we can do useful analysis/editing of the code, we need to pick a
+// single tree which we think is accurate. We use three main types of clues:
+//
+// A) Semantic language rules may restrict which parses are allowed.
+// For example, `string string string X` is *grammatical* C++, but only a
+// single type-name is allowed in a decl-specifier-sequence.
+// Where possible, these interpretations are forbidden by guards.
+// Sometimes this isn't possible, or we want our parser to be lenient.
+//
+// B) Some constructs are rarer, while others are common.
+// For example `a<b>::c` is often a template specialization, and rarely a
+// double comparison between a, b, and c.
+//
+// C) Identifier text hints whether they name types/values/templates etc.
+// "std" is usually a namespace, a project index may also guide us.
+// Hints may be within the document: if one occurrence of 'foo' is a variable
+// then the others probably are too.
+// (Text need not match: similar CaseStyle can be a weak hint, too).
+//
+//----------------------------------------------------------------------------//
+//
+// Mechanically, we replace each ambiguous node with its best alternative.
+//
+// "Best" is determined by assigning bonuses/penalties to nodes, to express
+// the clues of type A and B above. A forest node representing an unlikely
+// parse would apply a penalty to every subtree is is present in.
+// Disambiguation proceeds bottom-up, so that the score of each alternative
+// is known when a decision is made.
+//
+// Identifier-based hints within the document mean some nodes should be
+// *correlated*. Rather than resolve these simultaneously, we make the most
+// certain decisions first and use these results to update bonuses elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Forest.h"
+
+namespace clang::pseudo {
+
+struct DisambiguateParams {};
+
+// Maps ambiguous nodes onto the index of their preferred alternative.
+using Disambiguation = llvm::DenseMap<const ForestNode *, unsigned>;
+
+// Resolve each ambiguous node in the forest.
+// Maps each ambiguous node to the index of the chosen alternative.
+// FIXME: current implementation is a placeholder and chooses arbitrarily.
+Disambiguation disambiguate(const ForestNode *Root,
+ const DisambiguateParams &Params);
+
+// Remove all ambiguities from the forest, resolving them according to Disambig.
+void removeAmbiguities(ForestNode *&Root, const Disambiguation &Disambig);
+
+} // namespace clang::pseudo
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
index 35212c18a1161..e36e763937d60 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -80,6 +80,10 @@ class alignas(class ForestNode *) ForestNode {
assert(kind() == Sequence);
return children(Data >> RuleBits);
}
+ llvm::MutableArrayRef<ForestNode *> elements() {
+ assert(kind() == Sequence);
+ return children(Data >> RuleBits);
+ }
// Returns all possible interpretations of the code.
// REQUIRES: this is an Ambiguous node.
@@ -87,6 +91,10 @@ class alignas(class ForestNode *) ForestNode {
assert(kind() == Ambiguous);
return children(Data);
}
+ llvm::MutableArrayRef<ForestNode *> alternatives() {
+ assert(kind() == Ambiguous);
+ return children(Data);
+ }
llvm::ArrayRef<const ForestNode *> children() const {
switch (kind()) {
@@ -134,6 +142,10 @@ class alignas(class ForestNode *) ForestNode {
return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
Num);
}
+ llvm::MutableArrayRef<ForestNode *> children(uint16_t Num) {
+ return llvm::makeMutableArrayRef(reinterpret_cast<ForestNode **>(this + 1),
+ Num);
+ }
Token::Index StartIndex;
Kind K : 4;
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
index f5257ce8d82f3..1e867891b9728 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -130,8 +130,8 @@ struct ParseParams {
// A rule `_ := StartSymbol` must exit for the chosen start symbol.
//
// If the parsing fails, we model it as an opaque node in the forest.
-const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
- const Language &Lang);
+ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
+ const Language &Lang);
// Shift a token onto all OldHeads, placing the results into NewHeads.
//
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
index d517eef35503b..f9b9cfa7fa1b3 100644
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangPseudo
Bracket.cpp
DirectiveTree.cpp
+ Disambiguate.cpp
Forest.cpp
GLR.cpp
Lex.cpp
diff --git a/clang-tools-extra/pseudo/lib/Disambiguate.cpp b/clang-tools-extra/pseudo/lib/Disambiguate.cpp
new file mode 100644
index 0000000000000..b0bc75cf96c93
--- /dev/null
+++ b/clang-tools-extra/pseudo/lib/Disambiguate.cpp
@@ -0,0 +1,48 @@
+//===--- Disambiguate.cpp - Find the best tree in the forest --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Disambiguate.h"
+
+namespace clang::pseudo {
+
+Disambiguation disambiguate(const ForestNode *Root,
+ const DisambiguateParams &Params) {
+ // FIXME: this is a dummy placeholder strategy, implement a real one!
+ Disambiguation Result;
+ for (const ForestNode &N : Root->descendants()) {
+ if (N.kind() == ForestNode::Ambiguous)
+ Result.try_emplace(&N, 1);
+ }
+ return Result;
+}
+
+void removeAmbiguities(ForestNode *&Root, const Disambiguation &D) {
+ std::vector<ForestNode **> Queue = {&Root};
+ while (!Queue.empty()) {
+ ForestNode **Next = Queue.back();
+ Queue.pop_back();
+ switch ((*Next)->kind()) {
+ case ForestNode::Sequence:
+ for (ForestNode *&Child : (*Next)->elements())
+ Queue.push_back(&Child);
+ break;
+ case ForestNode::Ambiguous: {
+ assert(D.count(*Next) != 0 && "disambiguation is incomplete!");
+ ForestNode *ChosenChild = (*Next)->alternatives()[D.lookup(*Next)];
+ *Next = ChosenChild;
+ Queue.push_back(Next);
+ break;
+ }
+ case ForestNode::Terminal:
+ case ForestNode::Opaque:
+ break;
+ }
+ }
+}
+
+} // namespace clang::pseudo
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index 3e7d5a5435807..0639990e8f7f1 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -598,8 +598,8 @@ class GLRReduce {
} // namespace
-const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
- const Language &Lang) {
+ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
+ const Language &Lang) {
GLRReduce Reduce(Params, Lang);
assert(isNonterminal(StartSymbol) && "Start symbol must be a nonterminal");
llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Params.Code);
@@ -686,7 +686,7 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
return Result;
};
if (auto *Result = SearchForAccept(Heads))
- return *Result;
+ return *const_cast<ForestNode *>(Result); // Safe: we created all nodes.
// We failed to parse the input, returning an opaque forest node for recovery.
// FIXME: as above, we can add fallback error handling so this is impossible.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 294098a3a5c14..23af115e5fc3d 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -8,6 +8,7 @@
#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/DirectiveTree.h"
+#include "clang-pseudo/Disambiguate.h"
#include "clang-pseudo/Forest.h"
#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Language.h"
@@ -45,6 +46,8 @@ static opt<bool>
static opt<bool>
StripDirectives("strip-directives",
desc("Strip directives and select conditional sections"));
+static opt<bool> Disambiguate("disambiguate",
+ desc("Choose best tree from parse forest"));
static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
@@ -70,7 +73,8 @@ namespace clang {
namespace pseudo {
// Defined in HTMLForest.cpp
void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &,
- const ForestNode &Root, const TokenStream &);
+ const ForestNode &Root, const Disambiguation &,
+ const TokenStream &);
namespace {
struct NodeStats {
@@ -156,8 +160,12 @@ int main(int argc, char *argv[]) {
auto &Root =
glrParse(clang::pseudo::ParseParams{*ParseableStream, Arena, GSS},
*StartSymID, Lang);
- if (PrintForest)
+ // If we're disambiguating, we'll print at the end instead.
+ if (PrintForest && !Disambiguate)
llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
+ clang::pseudo::Disambiguation Disambig;
+ if (Disambiguate)
+ Disambig = clang::pseudo::disambiguate(&Root, {});
if (HTMLForest.getNumOccurrences()) {
std::error_code EC;
@@ -167,7 +175,8 @@ int main(int argc, char *argv[]) {
<< "\n";
return 2;
}
- clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, *ParseableStream);
+ clang::pseudo::writeHTMLForest(HTMLOut, Lang.G, Root, Disambig,
+ *ParseableStream);
}
if (PrintStatistics) {
@@ -219,6 +228,14 @@ int main(int argc, char *argv[]) {
llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
100.0 * UnparsedTokens / Len);
}
+
+ if (Disambiguate && PrintForest) {
+ ForestNode *DisambigRoot = &Root;
+ removeAmbiguities(DisambigRoot, Disambig);
+ llvm::outs() << "Disambiguated tree:\n";
+ llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
+ /*Abbreviated=*/ForestAbbrev);
+ }
}
return 0;
diff --git a/clang-tools-extra/pseudo/tool/HTMLForest.cpp b/clang-tools-extra/pseudo/tool/HTMLForest.cpp
index 954f9b0048a2f..184430bddd8d6 100644
--- a/clang-tools-extra/pseudo/tool/HTMLForest.cpp
+++ b/clang-tools-extra/pseudo/tool/HTMLForest.cpp
@@ -43,6 +43,7 @@
//
//===----------------------------------------------------------------------===//
+#include "clang-pseudo/Disambiguate.h"
#include "clang-pseudo/Forest.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/StringExtras.h"
@@ -60,6 +61,7 @@ struct Writer {
const Grammar &G;
const ForestNode &Root;
const TokenStream &Stream;
+ const Disambiguation &Disambig;
void write() {
Out << "<!doctype html>\n";
@@ -150,7 +152,8 @@ void Writer::writeForestJSON() {
break;
case ForestNode::Ambiguous:
Out.attribute("kind", "ambiguous");
- Out.attribute("selected", AssignID(N->children().front(), End));
+ Out.attribute("selected",
+ AssignID(N->children()[Disambig.lookup(N)], End));
break;
case ForestNode::Opaque:
Out.attribute("kind", "opaque");
@@ -180,8 +183,9 @@ void Writer::writeForestJSON() {
// We only accept the derived stream here.
// FIXME: allow the original stream instead?
void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &G,
- const ForestNode &Root, const TokenStream &Stream) {
- Writer{OS, G, Root, Stream}.write();
+ const ForestNode &Root, const Disambiguation &Disambig,
+ const TokenStream &Stream) {
+ Writer{OS, G, Root, Stream, Disambig}.write();
}
} // namespace pseudo
diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
index 9249c2984128b..831ae3d1256a9 100644
--- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
@@ -7,6 +7,7 @@ add_unittest(ClangPseudoUnitTests ClangPseudoTests
BracketTest.cpp
CXXTest.cpp
DirectiveTreeTest.cpp
+ DisambiguateTest.cpp
ForestTest.cpp
GLRTest.cpp
GrammarTest.cpp
diff --git a/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp b/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
new file mode 100644
index 0000000000000..2f483bb090660
--- /dev/null
+++ b/clang-tools-extra/pseudo/unittests/DisambiguateTest.cpp
@@ -0,0 +1,111 @@
+//===--- DisambiguateTest.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Disambiguate.h"
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/TokenKinds.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+namespace {
+using testing::ElementsAre;
+using testing::Pair;
+using testing::UnorderedElementsAre;
+
+// Common disambiguation test fixture.
+// This is the ambiguous forest representing parses of 'a * b;'.
+class DisambiguateTest : public ::testing::Test {
+protected:
+ // Greatly simplified C++ grammar.
+ enum Symbol : SymbolID {
+ Statement,
+ Declarator,
+ Expression,
+ DeclSpecifier,
+ Type,
+ Template,
+ };
+ enum Rule : RuleID {
+ /* LHS__RHS1_RHS2 means LHS := RHS1 RHS2 */
+ Statement__DeclSpecifier_Declarator_Semi,
+ Declarator__Star_Declarator,
+ Declarator__Identifier,
+ Statement__Expression_Semi,
+ Expression__Expression_Star_Expression,
+ Expression__Identifier,
+ DeclSpecifier__Type,
+ DeclSpecifier__Template,
+ Type__Identifier,
+ Template__Identifier,
+ };
+
+ ForestArena Arena;
+ ForestNode &A = Arena.createTerminal(tok::identifier, 0);
+ ForestNode &Star = Arena.createTerminal(tok::star, 1);
+ ForestNode &B = Arena.createTerminal(tok::identifier, 2);
+ ForestNode &Semi = Arena.createTerminal(tok::semi, 3);
+
+ // Parse as multiplication expression.
+ ForestNode &AExpr =
+ Arena.createSequence(Expression, Expression__Identifier, &A);
+ ForestNode &BExpr =
+ Arena.createSequence(Expression, Expression__Identifier, &B);
+ ForestNode &Expr =
+ Arena.createSequence(Expression, Expression__Expression_Star_Expression,
+ {&AExpr, &Star, &BExpr});
+ ForestNode &ExprStmt = Arena.createSequence(
+ Statement, Statement__Expression_Semi, {&Expr, &Semi});
+ // Parse as declaration (`a` may be CTAD or not).
+ ForestNode &AType =
+ Arena.createSequence(DeclSpecifier, DeclSpecifier__Type,
+ &Arena.createSequence(Type, Type__Identifier, &A));
+ ForestNode &ATemplate = Arena.createSequence(
+ DeclSpecifier, DeclSpecifier__Template,
+ &Arena.createSequence(Template, Template__Identifier, &A));
+ ForestNode &DeclSpec =
+ Arena.createAmbiguous(DeclSpecifier, {&AType, &ATemplate});
+ ForestNode &BDeclarator =
+ Arena.createSequence(Declarator, Declarator__Identifier, &B);
+ ForestNode &BPtr = Arena.createSequence(
+ Declarator, Declarator__Star_Declarator, {&Star, &BDeclarator});
+ ForestNode &DeclStmt =
+ Arena.createSequence(Statement, Statement__DeclSpecifier_Declarator_Semi,
+ {&DeclSpec, &Star, &BDeclarator});
+ // Top-level ambiguity
+ ForestNode &Stmt = Arena.createAmbiguous(Statement, {&ExprStmt, &DeclStmt});
+};
+
+TEST_F(DisambiguateTest, Remove) {
+ Disambiguation D;
+ D.try_emplace(&Stmt, 1); // statement is a declaration, not an expression
+ D.try_emplace(&DeclSpec, 0); // a is a type, not a (CTAD) template
+ ForestNode *Root = &Stmt;
+ removeAmbiguities(Root, D);
+
+ EXPECT_EQ(Root, &DeclStmt);
+ EXPECT_THAT(DeclStmt.elements(), ElementsAre(&AType, &Star, &BDeclarator));
+}
+
+TEST_F(DisambiguateTest, DummyStrategy) {
+ Disambiguation D = disambiguate(&Stmt, {});
+ EXPECT_THAT(D, UnorderedElementsAre(Pair(&Stmt, 1), Pair(&DeclSpec, 1)));
+
+ ForestNode *Root = &Stmt;
+ removeAmbiguities(Root, D);
+ EXPECT_EQ(Root, &DeclStmt);
+ EXPECT_THAT(DeclStmt.elements(),
+ ElementsAre(&ATemplate, &Star, &BDeclarator));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace clang
More information about the cfe-commits
mailing list