[clang-tools-extra] 0360b9f - [pseudo] (trivial) bracket-matching
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Tue May 24 06:13:41 PDT 2022
Author: Sam McCall
Date: 2022-05-24T15:13:36+02:00
New Revision: 0360b9f1599b0b13f164d8170a619b19f9cb8bb4
URL: https://github.com/llvm/llvm-project/commit/0360b9f1599b0b13f164d8170a619b19f9cb8bb4
DIFF: https://github.com/llvm/llvm-project/commit/0360b9f1599b0b13f164d8170a619b19f9cb8bb4.diff
LOG: [pseudo] (trivial) bracket-matching
Error-tolerant bracket matching enables our error-tolerant parsing strategies.
The implementation here is *not* yet error tolerant: this patch sets up the APIs
and plumbing, and describes the planned approach.
Differential Revision: https://reviews.llvm.org/D125911
Added:
clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
clang-tools-extra/pseudo/lib/Bracket.cpp
clang-tools-extra/pseudo/unittests/BracketTest.cpp
Modified:
clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
clang-tools-extra/pseudo/include/clang-pseudo/Token.h
clang-tools-extra/pseudo/lib/CMakeLists.txt
clang-tools-extra/pseudo/tool/ClangPseudo.cpp
clang-tools-extra/pseudo/unittests/CMakeLists.txt
Removed:
################################################################################
diff --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
index fb028be0c7abd..b10ff3a175bd2 100644
--- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
+++ b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
@@ -20,6 +20,7 @@
//===----------------------------------------------------------------------===//
#include "benchmark/benchmark.h"
+#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/DirectiveTree.h"
#include "clang-pseudo/Forest.h"
#include "clang-pseudo/GLR.h"
@@ -89,7 +90,9 @@ TokenStream lexAndPreprocess() {
chooseConditionalBranches(DirectiveStructure, RawStream);
TokenStream Cook =
cook(DirectiveStructure.stripDirectives(RawStream), LangOpts);
- return stripComments(Cook);
+ auto Stream = stripComments(Cook);
+ pairBrackets(Stream);
+ return Stream;
}
static void lex(benchmark::State &State) {
@@ -101,6 +104,16 @@ static void lex(benchmark::State &State) {
}
BENCHMARK(lex);
+static void pairBrackets(benchmark::State &State) {
+ clang::LangOptions LangOpts = genericLangOpts();
+ auto Stream = clang::pseudo::lex(*SourceText, LangOpts);
+ for (auto _ : State)
+ pairBrackets(Stream);
+ State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
+ SourceText->size());
+}
+BENCHMARK(pairBrackets);
+
static void preprocess(benchmark::State &State) {
clang::LangOptions LangOpts = genericLangOpts();
TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts);
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h b/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
new file mode 100644
index 0000000000000..268cfff1ab07a
--- /dev/null
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
@@ -0,0 +1,41 @@
+//===--- Bracket.h - Analyze bracket structure --------------------*-C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Bracket structure (particularly braces) is key to isolating broken regions
+// of code and preventing parsing from going "off the rails".
+//
+// For correct C++ code, brackets are well-nested and identifying pairs and
+// therefore blocks is simple. In broken code, brackets are not properly nested.
+// We cannot match them all and must choose which pairs to form.
+//
+// Rather than have the grammar-based parser make these choices, we pair
+// brackets up-front based on textual features like indentation.
+// This mirrors the way humans read code, and so is likely to produce the
+// "correct" interpretation of broken code.
+//
+// This interpretation then guides the parse: a rule containing a bracket pair
+// must match against paired bracket tokens.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_BRACKET_H
+#define CLANG_PSEUDO_BRACKET_H
+
+#include "clang-pseudo/Token.h"
+
+namespace clang {
+namespace pseudo {
+
+/// Identifies the bracket tokens in the stream which should be paired.
+/// Sets Token::Pair accordingly.
+void pairBrackets(TokenStream &);
+
+} // namespace pseudo
+} // namespace clang
+
+#endif
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
index 1750f547abd15..b558891f0a862 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -88,11 +88,15 @@ struct Token {
while (T->Kind == tok::comment);
return *T;
}
+ /// Returns the bracket paired with this one, if any.
+ const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
/// The type of token as determined by clang's lexer.
clang::tok::TokenKind Kind = clang::tok::unknown;
+ /// Offset from this token to its paired bracket, or 0 if it is unpaired.
+ int32_t Pair = 0;
};
-static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
+static_assert(sizeof(Token) <= sizeof(char *) + 20, "Careful with layout!");
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
/// A half-open range of tokens within a stream.
@@ -155,6 +159,11 @@ class TokenStream {
return tokens().slice(R.Begin, R.End - R.Begin);
}
+ MutableArrayRef<Token> tokens() {
+ assert(isFinalized());
+ return Tokens;
+ }
+
/// May return the end sentinel if the stream is empty.
const Token &front() const {
assert(isFinalized());
diff --git a/clang-tools-extra/pseudo/lib/Bracket.cpp b/clang-tools-extra/pseudo/lib/Bracket.cpp
new file mode 100644
index 0000000000000..07836146ad8a5
--- /dev/null
+++ b/clang-tools-extra/pseudo/lib/Bracket.cpp
@@ -0,0 +1,155 @@
+//===--- Bracket.cpp - Analyze bracket structure --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The basic phases of our bracket matching are:
+//
+// 1) A simple "greedy" match looks for well-nested subsequences.
+//
+// We can't fully trust the results of this, consider:
+// while (1) { // A
+// if (true) { // B
+// break;
+// } // C
+// Greedy matching will match B=C, when we should at least consider A=C.
+// However for the correct parts of the file, the greedy match gives the
+// right answer. It produces useful candidates for phase 2.
+//
+// simplePairBrackets handles this step.
+//
+// 2) Try to identify places where formatting indicates that the greedy match
+// was correct. This is similar to how a human would scan a large file.
+//
+// For example:
+// int foo() { // X
+// // indented
+// while (1) {
+// // valid code
+// }
+// return bar(42);
+// } // Y
+// We can "verify" that X..Y looks like a braced block, and the greedy match
+// tells us that substring is perfectly nested.
+// We trust the pairings of those brackets and don't examine them further.
+// However in the first example above, we do not trust B=C because the brace
+// indentation is suspect.
+//
+// FIXME: implement this step.
+//
+// 3) Run full best-match optimization on remaining brackets.
+//
+// Conceptually, this considers all possible matchings and optimizes cost:
+// - there is a cost for failing to match a bracket
+// - there is a variable cost for matching two brackets.
+// (For example if brace indentation doesn't match).
+//
+// In the first example we have three alternatives, and they are ranked:
+// 1) A=C, skip B
+// 2) B=C, skip A
+// 3) skip A, skip B, skip C
+// The cost for skipping a bracket is high, so option 3 is worst.
+// B=C costs more than A=C, because the indentation doesn't match.
+//
+// It would be correct to run this step alone, but it would be too slow.
+// The implementation is dynamic programming in N^2 space and N^3 time.
+// Having earlier steps filter out most brackets is key to performance.
+//
+// FIXME: implement this step.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Bracket.h"
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+struct Bracket {
+ using Index = unsigned;
+ constexpr static Index None = -1;
+
+ enum BracketKind : char { Paren, Brace, Square } Kind;
+ enum Direction : bool { Open, Close } Dir;
+ unsigned Line;
+ unsigned Indent;
+ Token::Index Tok;
+ Bracket::Index Pair = None;
+};
+
+// Find brackets in the stream and convert to Bracket struct.
+std::vector<Bracket> findBrackets(const TokenStream &Stream) {
+ std::vector<Bracket> Brackets;
+ auto Add = [&](const pseudo::Token &Tok, Bracket::BracketKind K,
+ Bracket::Direction D) {
+ Brackets.push_back(
+ {K, D, Tok.Line, Tok.Indent, Stream.index(Tok), Bracket::None});
+ };
+ for (const auto &Tok : Stream.tokens()) {
+ switch (Tok.Kind) {
+ case clang::tok::l_paren:
+ Add(Tok, Bracket::Paren, Bracket::Open);
+ break;
+ case clang::tok::r_paren:
+ Add(Tok, Bracket::Paren, Bracket::Close);
+ break;
+ case clang::tok::l_brace:
+ Add(Tok, Bracket::Brace, Bracket::Open);
+ break;
+ case clang::tok::r_brace:
+ Add(Tok, Bracket::Brace, Bracket::Close);
+ break;
+ case clang::tok::l_square:
+ Add(Tok, Bracket::Square, Bracket::Open);
+ break;
+ case clang::tok::r_square:
+ Add(Tok, Bracket::Square, Bracket::Close);
+ break;
+ default:
+ break;
+ }
+ }
+ return Brackets;
+}
+
+// Write the bracket pairings from Brackets back to Tokens.
+void applyPairings(ArrayRef<Bracket> Brackets, TokenStream &Tokens) {
+ for (const auto &B : Brackets)
+ Tokens.tokens()[B.Tok].Pair =
+ (B.Pair == Bracket::None) ? 0 : (int32_t)Brackets[B.Pair].Tok - B.Tok;
+}
+
+// Find perfect pairings (ignoring whitespace) via greedy algorithm.
+// This means two brackets are paired if they match and the brackets between
+// them nest perfectly, with no skipped or crossed brackets.
+void simplePairBrackets(MutableArrayRef<Bracket> Brackets) {
+ std::vector<unsigned> Stack;
+ for (unsigned I = 0; I < Brackets.size(); ++I) {
+ if (Brackets[I].Dir == Bracket::Open) {
+ Stack.push_back(I);
+ } else if (!Stack.empty() &&
+ Brackets[Stack.back()].Kind == Brackets[I].Kind) {
+ Brackets[Stack.back()].Pair = I;
+ Brackets[I].Pair = Stack.back();
+ Stack.pop_back();
+ } else {
+ // Unpaired closer, no brackets on stack are part of a perfect sequence.
+ Stack.clear();
+ }
+ }
+ // Any remaining brackets on the stack stay unpaired.
+}
+
+} // namespace
+
+void pairBrackets(TokenStream &Stream) {
+ auto Brackets = findBrackets(Stream);
+ simplePairBrackets(Brackets);
+ applyPairings(Brackets, Stream);
+}
+
+} // namespace pseudo
+} // namespace clang
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
index b11d2dd12e280..6dc8ed5b5e7a2 100644
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -1,6 +1,7 @@
set(LLVM_LINK_COMPONENTS Support)
add_clang_library(clangPseudo
+ Bracket.cpp
DirectiveTree.cpp
Forest.cpp
GLR.cpp
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 5a6956df1f708..1d3ab19b3c09d 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/DirectiveTree.h"
#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Grammar.h"
@@ -89,6 +90,7 @@ int main(int argc, char *argv[]) {
llvm::outs() << DirectiveStructure;
ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
+ pairBrackets(*ParseableStream);
}
if (Grammar.getNumOccurrences()) {
diff --git a/clang-tools-extra/pseudo/unittests/BracketTest.cpp b/clang-tools-extra/pseudo/unittests/BracketTest.cpp
new file mode 100644
index 0000000000000..1247ddbd49a1d
--- /dev/null
+++ b/clang-tools-extra/pseudo/unittests/BracketTest.cpp
@@ -0,0 +1,117 @@
+//===--- BracketTest.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Bracket.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "llvm/Testing/Support/Annotations.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace pseudo {
+
+// Return a version of Code with each paired bracket marked with ^.
+std::string decorate(llvm::StringRef Code, const TokenStream &Stream) {
+ std::string Result;
+ const char *Pos = Code.data();
+ for (const Token &Tok : Stream.tokens()) {
+ if (Tok.Pair == 0)
+ continue;
+ const char *NewPos = Tok.text().begin();
+ assert(NewPos >= Code.begin() && NewPos < Code.end());
+ Result.append(Pos, NewPos - Pos);
+ Result.push_back('^');
+ Pos = NewPos;
+ }
+ Result.append(Pos, Code.end() - Pos);
+ return Result;
+}
+
+// Checks that the brackets matched in Stream are those annotated in MarkedCode.
+void verifyMatchedSet(llvm::StringRef Code, llvm::StringRef MarkedCode,
+ const TokenStream &Stream) {
+ EXPECT_EQ(MarkedCode, decorate(Code, Stream));
+}
+
+// Checks that paired brackets within the stream nest properly.
+void verifyNesting(const TokenStream &Stream) {
+ std::vector<const Token *> Stack;
+ for (const auto &Tok : Stream.tokens()) {
+ if (Tok.Pair > 0)
+ Stack.push_back(&Tok);
+ else if (Tok.Pair < 0) {
+ ASSERT_FALSE(Stack.empty()) << Tok;
+ ASSERT_EQ(Stack.back(), Tok.pair())
+ << *Stack.back() << " != " << *Tok.pair() << " = pair of " << Tok;
+ Stack.pop_back();
+ }
+ }
+ ASSERT_THAT(Stack, testing::IsEmpty());
+}
+
+// Checks that ( pairs with a ) on its right, etc.
+void verifyMatchKind(const TokenStream &Stream) {
+ for (const auto &Tok : Stream.tokens()) {
+ if (Tok.Pair == 0)
+ continue;
+ auto Want = [&]() -> std::pair<bool, tok::TokenKind> {
+ switch (Tok.Kind) {
+ case tok::l_paren:
+ return {true, tok::r_paren};
+ case tok::r_paren:
+ return {false, tok::l_paren};
+ case tok::l_brace:
+ return {true, tok::r_brace};
+ case tok::r_brace:
+ return {false, tok::l_brace};
+ case tok::l_square:
+ return {true, tok::r_square};
+ case tok::r_square:
+ return {false, tok::l_square};
+ default:
+ ADD_FAILURE() << "Paired non-bracket " << Tok;
+ return {false, tok::eof};
+ }
+ }();
+ EXPECT_EQ(Tok.Pair > 0, Want.first) << Tok;
+ EXPECT_EQ(Tok.pair()->Kind, Want.second) << Tok;
+ }
+}
+
+// Verifies an expected bracket pairing like:
+// ^( [ ^)
+// The input is annotated code, with the brackets expected to be matched marked.
+//
+// The input doesn't specify which bracket matches with which, but we verify:
+// - exactly the marked subset are paired
+// - ( is paired to a later ), etc
+// - brackets properly nest
+// This uniquely determines the bracket structure, so we indirectly verify it.
+// If particular tests should emphasize which brackets are paired, use comments.
+void verifyBrackets(llvm::StringRef MarkedCode) {
+ SCOPED_TRACE(MarkedCode);
+ llvm::Annotations A(MarkedCode);
+ std::string Code = A.code().str();
+ LangOptions LangOpts;
+ auto Stream = lex(Code, LangOpts);
+ pairBrackets(Stream);
+
+ verifyMatchedSet(Code, MarkedCode, Stream);
+ verifyNesting(Stream);
+ verifyMatchKind(Stream);
+}
+
+TEST(Bracket, SimplePair) {
+ verifyBrackets("^{ ^[ ^( ^) ^( ^) ^] ^}");
+ verifyBrackets(") ^{ ^[ ^] ^} (");
+ verifyBrackets("{ [ ( ] }"); // FIXME
+}
+
+} // namespace pseudo
+} // namespace clang
diff --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
index aba8a16674899..73b13984d93e6 100644
--- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
@@ -1,9 +1,11 @@
set(LLVM_LINK_COMPONENTS
Support
+ TestingSupport
)
add_custom_target(ClangPseudoUnitTests)
add_unittest(ClangPseudoUnitTests ClangPseudoTests
+ BracketTest.cpp
DirectiveTreeTest.cpp
ForestTest.cpp
GLRTest.cpp
More information about the cfe-commits
mailing list