[clang-tools-extra] 0360b9f - [pseudo] (trivial) bracket-matching

Sam McCall via cfe-commits cfe-commits at lists.llvm.org
Tue May 24 06:13:41 PDT 2022


Author: Sam McCall
Date: 2022-05-24T15:13:36+02:00
New Revision: 0360b9f1599b0b13f164d8170a619b19f9cb8bb4

URL: https://github.com/llvm/llvm-project/commit/0360b9f1599b0b13f164d8170a619b19f9cb8bb4
DIFF: https://github.com/llvm/llvm-project/commit/0360b9f1599b0b13f164d8170a619b19f9cb8bb4.diff

LOG: [pseudo] (trivial) bracket-matching

Error-tolerant bracket matching enables our error-tolerant parsing strategies.
The implementation here is *not* yet error tolerant: this patch sets up the APIs
and plumbing, and describes the planned approach.

Differential Revision: https://reviews.llvm.org/D125911

Added: 
    clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
    clang-tools-extra/pseudo/lib/Bracket.cpp
    clang-tools-extra/pseudo/unittests/BracketTest.cpp

Modified: 
    clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
    clang-tools-extra/pseudo/include/clang-pseudo/Token.h
    clang-tools-extra/pseudo/lib/CMakeLists.txt
    clang-tools-extra/pseudo/tool/ClangPseudo.cpp
    clang-tools-extra/pseudo/unittests/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
index fb028be0c7abd..b10ff3a175bd2 100644
--- a/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
+++ b/clang-tools-extra/pseudo/benchmarks/Benchmark.cpp
@@ -20,6 +20,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "benchmark/benchmark.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/DirectiveTree.h"
 #include "clang-pseudo/Forest.h"
 #include "clang-pseudo/GLR.h"
@@ -89,7 +90,9 @@ TokenStream lexAndPreprocess() {
   chooseConditionalBranches(DirectiveStructure, RawStream);
   TokenStream Cook =
       cook(DirectiveStructure.stripDirectives(RawStream), LangOpts);
-  return stripComments(Cook);
+  auto Stream = stripComments(Cook);
+  pairBrackets(Stream);
+  return Stream;
 }
 
 static void lex(benchmark::State &State) {
@@ -101,6 +104,16 @@ static void lex(benchmark::State &State) {
 }
 BENCHMARK(lex);
 
+static void pairBrackets(benchmark::State &State) {
+  clang::LangOptions LangOpts = genericLangOpts();
+  auto Stream = clang::pseudo::lex(*SourceText, LangOpts);
+  for (auto _ : State)
+    pairBrackets(Stream);
+  State.SetBytesProcessed(static_cast<uint64_t>(State.iterations()) *
+                          SourceText->size());
+}
+BENCHMARK(pairBrackets);
+
 static void preprocess(benchmark::State &State) {
   clang::LangOptions LangOpts = genericLangOpts();
   TokenStream RawStream = clang::pseudo::lex(*SourceText, LangOpts);

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h b/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
new file mode 100644
index 0000000000000..268cfff1ab07a
--- /dev/null
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Bracket.h
@@ -0,0 +1,41 @@
+//===--- Bracket.h - Analyze bracket structure --------------------*-C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Bracket structure (particularly braces) is key to isolating broken regions
+// of code and preventing parsing from going "off the rails".
+//
+// For correct C++ code, brackets are well-nested and identifying pairs and
+// therefore blocks is simple. In broken code, brackets are not properly nested.
+// We cannot match them all and must choose which pairs to form.
+//
+// Rather than have the grammar-based parser make these choices, we pair
+// brackets up-front based on textual features like indentation.
+// This mirrors the way humans read code, and so is likely to produce the
+// "correct" interpretation of broken code.
+//
+// This interpretation then guides the parse: a rule containing a bracket pair
+// must match against paired bracket tokens.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_BRACKET_H
+#define CLANG_PSEUDO_BRACKET_H
+
+#include "clang-pseudo/Token.h"
+
+namespace clang {
+namespace pseudo {
+
+/// Identifies bracket tokens in the stream which should be paired.
+/// Sets Token::Pair accordingly.
+void pairBrackets(TokenStream &);
+
+} // namespace pseudo
+} // namespace clang
+
+#endif

diff  --git a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
index 1750f547abd15..b558891f0a862 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/Token.h
@@ -88,11 +88,15 @@ struct Token {
     while (T->Kind == tok::comment);
     return *T;
   }
+  /// Returns the bracket paired with this one, if any.
+  const Token *pair() const { return Pair == 0 ? nullptr : this + Pair; }
 
   /// The type of token as determined by clang's lexer.
   clang::tok::TokenKind Kind = clang::tok::unknown;
+  /// For a paired bracket, the relative offset to its pair; 0 if unpaired.
+  int32_t Pair = 0;
 };
-static_assert(sizeof(Token) <= sizeof(char *) + 16, "Careful with layout!");
+static_assert(sizeof(Token) <= sizeof(char *) + 20, "Careful with layout!");
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Token &);
 
 /// A half-open range of tokens within a stream.
@@ -155,6 +159,11 @@ class TokenStream {
     return tokens().slice(R.Begin, R.End - R.Begin);
   }
 
+  MutableArrayRef<Token> tokens() {
+    assert(isFinalized());
+    return Tokens;
+  }
+
   /// May return the end sentinel if the stream is empty.
   const Token &front() const {
     assert(isFinalized());

diff  --git a/clang-tools-extra/pseudo/lib/Bracket.cpp b/clang-tools-extra/pseudo/lib/Bracket.cpp
new file mode 100644
index 0000000000000..07836146ad8a5
--- /dev/null
+++ b/clang-tools-extra/pseudo/lib/Bracket.cpp
@@ -0,0 +1,155 @@
+//===--- Bracket.cpp - Analyze bracket structure --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The basic phases of our bracket matching are:
+//
+// 1) A simple "greedy" match looks for well-nested subsequences.
+//
+//    We can't fully trust the results of this, consider:
+//      while (1) {   // A
+//        if (true) { // B
+//          break;
+//      }             // C
+//    Greedy matching will match B=C, when we should at least consider A=C.
+//    However for the correct parts of the file, the greedy match gives the
+//    right answer. It produces useful candidates for phase 2.
+//
+//    simplePairBrackets handles this step.
+//
+// 2) Try to identify places where formatting indicates that the greedy match
+//    was correct. This is similar to how a human would scan a large file.
+//
+//    For example:
+//      int foo() {      // X
+//        // indented
+//        while (1) {
+//          // valid code
+//        }
+//        return bar(42);
+//      }                // Y
+//    We can "verify" that X..Y looks like a braced block, and the greedy match
+//    tells us that substring is perfectly nested.
+//    We trust the pairings of those brackets and don't examine them further.
+//    However in the first example above, we do not trust B=C because the brace
+//    indentation is suspect.
+//
+//    FIXME: implement this step.
+//
+// 3) Run full best-match optimization on remaining brackets.
+//
+//    Conceptually, this considers all possible matchings and optimizes cost:
+//      - there is a cost for failing to match a bracket
+//      - there is a variable cost for matching two brackets.
+//        (For example if brace indentation doesn't match).
+//
+//    In the first example we have three alternatives, and they are ranked:
+//      1) A=C, skip B
+//      2) B=C, skip A
+//      3) skip A, skip B, skip C
+//    The cost for skipping a bracket is high, so option 3 is worst.
+//    B=C costs more than A=C, because the indentation doesn't match.
+//
+//    It would be correct to run this step alone, but it would be too slow.
+//    The implementation is dynamic programming in N^3 time and N^2 space.
+//    Having earlier steps filter out most brackets is key to performance.
+//
+//    FIXME: implement this step.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Bracket.h"
+
+namespace clang {
+namespace pseudo {
+namespace {
+
+struct Bracket {
+  using Index = unsigned;
+  constexpr static Index None = -1;
+
+  enum BracketKind : char { Paren, Brace, Square } Kind;
+  enum Direction : bool { Open, Close } Dir;
+  unsigned Line;
+  unsigned Indent;
+  Token::Index Tok;
+  Bracket::Index Pair = None;
+};
+
+// Find brackets in the stream and convert to Bracket struct.
+std::vector<Bracket> findBrackets(const TokenStream &Stream) {
+  std::vector<Bracket> Brackets;
+  auto Add = [&](const pseudo::Token &Tok, Bracket::BracketKind K,
+                 Bracket::Direction D) {
+    Brackets.push_back(
+        {K, D, Tok.Line, Tok.Indent, Stream.index(Tok), Bracket::None});
+  };
+  for (const auto &Tok : Stream.tokens()) {
+    switch (Tok.Kind) {
+    case clang::tok::l_paren:
+      Add(Tok, Bracket::Paren, Bracket::Open);
+      break;
+    case clang::tok::r_paren:
+      Add(Tok, Bracket::Paren, Bracket::Close);
+      break;
+    case clang::tok::l_brace:
+      Add(Tok, Bracket::Brace, Bracket::Open);
+      break;
+    case clang::tok::r_brace:
+      Add(Tok, Bracket::Brace, Bracket::Close);
+      break;
+    case clang::tok::l_square:
+      Add(Tok, Bracket::Square, Bracket::Open);
+      break;
+    case clang::tok::r_square:
+      Add(Tok, Bracket::Square, Bracket::Close);
+      break;
+    default:
+      break;
+    }
+  }
+  return Brackets;
+}
+
+// Write the bracket pairings from Brackets back to Tokens.
+void applyPairings(ArrayRef<Bracket> Brackets, TokenStream &Tokens) {
+  for (const auto &B : Brackets)
+    Tokens.tokens()[B.Tok].Pair =
+        (B.Pair == Bracket::None) ? 0 : (int32_t)Brackets[B.Pair].Tok - B.Tok;
+}
+
+// Find perfect pairings (ignoring whitespace) via greedy algorithm.
+// This means two brackets are paired if they match and the brackets between
+// them nest perfectly, with no skipped or crossed brackets.
+void simplePairBrackets(MutableArrayRef<Bracket> Brackets) {
+  std::vector<unsigned> Stack;
+  for (unsigned I = 0; I < Brackets.size(); ++I) {
+    if (Brackets[I].Dir == Bracket::Open) {
+      Stack.push_back(I);
+    } else if (!Stack.empty() &&
+               Brackets[Stack.back()].Kind == Brackets[I].Kind) {
+      Brackets[Stack.back()].Pair = I;
+      Brackets[I].Pair = Stack.back();
+      Stack.pop_back();
+    } else {
+      // Unpaired closer, no brackets on stack are part of a perfect sequence.
+      Stack.clear();
+    }
+  }
+  // Any remaining brackets on the stack stay unpaired.
+}
+
+} // namespace
+
+void pairBrackets(TokenStream &Stream) {
+  auto Brackets = findBrackets(Stream);
+  simplePairBrackets(Brackets);
+  applyPairings(Brackets, Stream);
+}
+
+} // namespace pseudo
+} // namespace clang

diff  --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
index b11d2dd12e280..6dc8ed5b5e7a2 100644
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LLVM_LINK_COMPONENTS Support)
 
 add_clang_library(clangPseudo
+  Bracket.cpp
   DirectiveTree.cpp
   Forest.cpp
   GLR.cpp

diff  --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 5a6956df1f708..1d3ab19b3c09d 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/DirectiveTree.h"
 #include "clang-pseudo/GLR.h"
 #include "clang-pseudo/Grammar.h"
@@ -89,6 +90,7 @@ int main(int argc, char *argv[]) {
       llvm::outs() << DirectiveStructure;
 
     ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
+    pairBrackets(*ParseableStream);
   }
 
   if (Grammar.getNumOccurrences()) {

diff  --git a/clang-tools-extra/pseudo/unittests/BracketTest.cpp b/clang-tools-extra/pseudo/unittests/BracketTest.cpp
new file mode 100644
index 0000000000000..1247ddbd49a1d
--- /dev/null
+++ b/clang-tools-extra/pseudo/unittests/BracketTest.cpp
@@ -0,0 +1,117 @@
+//===--- BracketTest.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Bracket.h"
+#include "clang-pseudo/Token.h"
+#include "clang/Basic/LangOptions.h"
+#include "llvm/Testing/Support/Annotations.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace pseudo {
+
+// Return a version of Code with each paired bracket marked with ^.
+std::string decorate(llvm::StringRef Code, const TokenStream &Stream) {
+  std::string Result;
+  const char *Pos = Code.data();
+  for (const Token &Tok : Stream.tokens()) {
+    if (Tok.Pair == 0)
+      continue;
+    const char *NewPos = Tok.text().begin();
+    assert(NewPos >= Code.begin() && NewPos < Code.end());
+    Result.append(Pos, NewPos - Pos);
+    Result.push_back('^');
+    Pos = NewPos;
+  }
+  Result.append(Pos, Code.end() - Pos);
+  return Result;
+}
+
+// Checks that the brackets matched in Stream are those annotated in MarkedCode.
+void verifyMatchedSet(llvm::StringRef Code, llvm::StringRef MarkedCode,
+                      const TokenStream &Stream) {
+  EXPECT_EQ(MarkedCode, decorate(Code, Stream));
+}
+
+// Checks that paired brackets within the stream nest properly.
+void verifyNesting(const TokenStream &Stream) {
+  std::vector<const Token *> Stack;
+  for (const auto &Tok : Stream.tokens()) {
+    if (Tok.Pair > 0)
+      Stack.push_back(&Tok);
+    else if (Tok.Pair < 0) {
+      ASSERT_FALSE(Stack.empty()) << Tok;
+      ASSERT_EQ(Stack.back(), Tok.pair())
+          << *Stack.back() << " != " << *Tok.pair() << " = pair of " << Tok;
+      Stack.pop_back();
+    }
+  }
+  ASSERT_THAT(Stack, testing::IsEmpty());
+}
+
+// Checks that ( pairs with a ) on its right, etc.
+void verifyMatchKind(const TokenStream &Stream) {
+  for (const auto &Tok : Stream.tokens()) {
+    if (Tok.Pair == 0)
+      continue;
+    auto Want = [&]() -> std::pair<bool, tok::TokenKind> {
+      switch (Tok.Kind) {
+      case tok::l_paren:
+        return {true, tok::r_paren};
+      case tok::r_paren:
+        return {false, tok::l_paren};
+      case tok::l_brace:
+        return {true, tok::r_brace};
+      case tok::r_brace:
+        return {false, tok::l_brace};
+      case tok::l_square:
+        return {true, tok::r_square};
+      case tok::r_square:
+        return {false, tok::l_square};
+      default:
+        ADD_FAILURE() << "Paired non-bracket " << Tok;
+        return {false, tok::eof};
+      }
+    }();
+    EXPECT_EQ(Tok.Pair > 0, Want.first) << Tok;
+    EXPECT_EQ(Tok.pair()->Kind, Want.second) << Tok;
+  }
+}
+
+// Verifies an expected bracket pairing like:
+//   ^( [ ^)
+// The input is annotated code, with the brackets expected to be matched marked.
+//
+// The input doesn't specify which bracket matches with which, but we verify:
+//  - exactly the marked subset are paired
+//  - ( is paired to a later ), etc
+//  - brackets properly nest
+// This uniquely determines the bracket structure, so we indirectly verify it.
+// If particular tests should emphasize which brackets are paired, use comments.
+void verifyBrackets(llvm::StringRef MarkedCode) {
+  SCOPED_TRACE(MarkedCode);
+  llvm::Annotations A(MarkedCode);
+  std::string Code = A.code().str();
+  LangOptions LangOpts;
+  auto Stream = lex(Code, LangOpts);
+  pairBrackets(Stream);
+
+  verifyMatchedSet(Code, MarkedCode, Stream);
+  verifyNesting(Stream);
+  verifyMatchKind(Stream);
+}
+
+TEST(Bracket, SimplePair) {
+  verifyBrackets("^{ ^[ ^( ^)  ^( ^) ^] ^}");
+  verifyBrackets(") ^{ ^[ ^] ^} (");
+  verifyBrackets("{ [ ( ] }"); // FIXME
+}
+
+} // namespace pseudo
+} // namespace clang

diff  --git a/clang-tools-extra/pseudo/unittests/CMakeLists.txt b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
index aba8a16674899..73b13984d93e6 100644
--- a/clang-tools-extra/pseudo/unittests/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/unittests/CMakeLists.txt
@@ -1,9 +1,11 @@
 set(LLVM_LINK_COMPONENTS
   Support
+  TestingSupport
   )
 
 add_custom_target(ClangPseudoUnitTests)
 add_unittest(ClangPseudoUnitTests ClangPseudoTests
+  BracketTest.cpp
   DirectiveTreeTest.cpp
   ForestTest.cpp
   GLRTest.cpp


        


More information about the cfe-commits mailing list