[llvm] 8bd078b - [Symbolize] Parse multi-line markup elements.

Wed Jun 22 10:00:47 PDT 2022

Author: Daniel Thornburgh
Date: 2022-06-22T10:00:43-07:00
New Revision: 8bd078b57c7d308c0b6b29da9f98386a17b72b89

URL: https://github.com/llvm/llvm-project/commit/8bd078b57c7d308c0b6b29da9f98386a17b72b89
DIFF: https://github.com/llvm/llvm-project/commit/8bd078b57c7d308c0b6b29da9f98386a17b72b89.diff

LOG: [Symbolize] Parse multi-line markup elements.

This allows registering certain tags as possibly beginning multi-line
elements in the symbolizer markup parser. The parser is kept agnostic to
how lines are delimited; it reports the entire contents, including line
endings, once the end-of-element marker is reached.

Reviewed By: peter.smith

Differential Revision: https://reviews.llvm.org/D124798

Added: 
    

Modified: 
    llvm/include/llvm/DebugInfo/Symbolize/Markup.h
    llvm/lib/DebugInfo/Symbolize/Markup.cpp
    llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
index 19cc0ab622f0b..86c133dd66adf 100644

--- a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Regex.h"
 
 namespace llvm {
@@ -52,7 +53,7 @@ struct MarkupNode {
 /// Parses a log containing symbolizer markup into a sequence of nodes.
 class MarkupParser {
 public:
-  MarkupParser();
+  MarkupParser(StringSet<> MultilineTags = {});
 
   /// Parses an individual \p Line of input.
   ///
@@ -60,34 +61,54 @@ class MarkupParser {
   /// by nextNode() are discarded. The nodes returned by nextNode() may
   /// reference the input string, so it must be retained by the caller until the
   /// last use.
+  ///
+  /// Note that some elements may span multiple lines. If a line ends with the
+  /// start of one of these elements, then no nodes will be produced until
+  /// either the end or something that cannot be part of an element is
+  /// encountered. This may only occur after multiple calls to parseLine(),
+  /// corresponding to the lines of the multi-line element.
   void parseLine(StringRef Line);
 
-  /// Returns the next node from the most recent parseLine() call.
+  /// Inform the parser that the input stream has ended.
+  ///
+  /// This allows the parser to finish any deferred processing (e.g., an
+  /// in-progress multi-line element) and may cause nextNode() to return
+  /// additional nodes.
+  void flush();
+
+  /// Returns the next node in the input sequence.
   ///
   /// Calling nextNode() may invalidate the contents of the node returned by the
   /// previous call.
   ///
   /// \returns the next markup node or None if none remain.
-  Optional<MarkupNode> nextNode() {
-    if (!NextIdx)
-      NextIdx = 0;
-    if (*NextIdx == Buffer.size()) {
-      NextIdx.reset();
-      Buffer.clear();
-      return None;
-    }
-    return std::move(Buffer[(*NextIdx)++]);
-  }
+  Optional<MarkupNode> nextNode();
 
 private:
   Optional<MarkupNode> parseElement(StringRef Line);
   void parseTextOutsideMarkup(StringRef Text);
+  Optional<StringRef> parseMultiLineBegin(StringRef Line);
+  Optional<StringRef> parseMultiLineEnd(StringRef Line);
+
+  // Tags of elements that can span multiple lines.
+  const StringSet<> MultilineTags;
+
+  // Contents of a multi-line element that has finished being parsed. Retained
+  // to keep returned StringRefs for the contents valid.
+  std::string FinishedMultiline;
+
+  // Contents of a multi-line element that is still in the process of receiving
+  // lines.
+  std::string InProgressMultiline;
+
+  // The line currently being parsed.
+  StringRef Line;
 
   // Buffer for nodes parsed from the current line.
   SmallVector<MarkupNode> Buffer;
 
-  // Next buffer index to return or None if nextNode has not yet been called.
-  Optional<size_t> NextIdx;
+  // Next buffer index to return.
+  size_t NextIdx;
 
   // Regular expression matching supported ANSI SGR escape sequences.
   const Regex SGRSyntax;

diff  --git a/llvm/lib/DebugInfo/Symbolize/Markup.cpp b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
index 04cf7b38b7997..9bc65e763287f 100644
--- a/llvm/lib/DebugInfo/Symbolize/Markup.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/DebugInfo/Symbolize/Markup.h"
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 
 namespace llvm {
@@ -24,7 +25,8 @@ namespace symbolize {
 //   "\033[30m" -- "\033[37m"
 static const char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m";
 
-MarkupParser::MarkupParser() : SGRSyntax(SGRSyntaxStr) {}
+MarkupParser::MarkupParser(StringSet<> MultilineTags)
+    : MultilineTags(std::move(MultilineTags)), SGRSyntax(SGRSyntaxStr) {}
 
 static StringRef takeTo(StringRef Str, StringRef::iterator Pos) {
   return Str.take_front(Pos - Str.begin());
@@ -35,18 +37,73 @@ static void advanceTo(StringRef &Str, StringRef::iterator Pos) {
 
 void MarkupParser::parseLine(StringRef Line) {
   Buffer.clear();
-  while (!Line.empty()) {
-    // Find the first valid markup element, if any.
-    if (Optional<MarkupNode> Element = parseElement(Line)) {
-      parseTextOutsideMarkup(takeTo(Line, Element->Text.begin()));
-      Buffer.push_back(std::move(*Element));
-      advanceTo(Line, Element->Text.end());
-    } else {
-      // The line doesn't contain any more markup elements, so emit it as text.
-      parseTextOutsideMarkup(Line);
-      return;
+  NextIdx = 0;
+  FinishedMultiline.clear();
+  this->Line = Line;
+}
+
+Optional<MarkupNode> MarkupParser::nextNode() {
+  // Pull something out of the buffer if possible.
+  if (!Buffer.empty()) {
+    if (NextIdx < Buffer.size())
+      return std::move(Buffer[NextIdx++]);
+    NextIdx = 0;
+    Buffer.clear();
+  }
+
+  // The buffer is empty, so parse the next bit of the line.
+
+  if (Line.empty())
+    return None;
+
+  if (!InProgressMultiline.empty()) {
+    if (Optional<StringRef> MultilineEnd = parseMultiLineEnd(Line)) {
+      llvm::append_range(InProgressMultiline, *MultilineEnd);
+      assert(FinishedMultiline.empty() &&
+             "At most one multi-line element can be finished at a time.");
+      FinishedMultiline.swap(InProgressMultiline);
+      // Parse the multi-line element as if it were contiguous.
+      advanceTo(Line, MultilineEnd->end());
+      return *parseElement(FinishedMultiline);
     }
+
+    // The whole line is part of the multi-line element.
+    llvm::append_range(InProgressMultiline, Line);
+    Line = Line.drop_front(Line.size());
+    return None;
+  }
+
+  // Find the first valid markup element, if any.
+  if (Optional<MarkupNode> Element = parseElement(Line)) {
+    parseTextOutsideMarkup(takeTo(Line, Element->Text.begin()));
+    Buffer.push_back(std::move(*Element));
+    advanceTo(Line, Element->Text.end());
+    return nextNode();
+  }
+
+  // Since there were no valid elements remaining, see if the line opens a
+  // multi-line element.
+  if (Optional<StringRef> MultilineBegin = parseMultiLineBegin(Line)) {
+    // Emit any text before the element.
+    parseTextOutsideMarkup(takeTo(Line, MultilineBegin->begin()));
+
+    // Begin recording the multi-line element.
+    llvm::append_range(InProgressMultiline, *MultilineBegin);
+    Line = Line.drop_front(Line.size());
+    return nextNode();
   }
+
+  // The line doesn't contain any more markup elements, so emit it as text.
+  parseTextOutsideMarkup(Line);
+  Line = Line.drop_front(Line.size());
+  return nextNode();
+}
+
+void MarkupParser::flush() {
+  if (InProgressMultiline.empty())
+    return;
+  FinishedMultiline.swap(InProgressMultiline);
+  parseTextOutsideMarkup(FinishedMultiline);
 }
 
 // Finds and returns the next valid markup element in the given line. Returns
@@ -107,5 +164,39 @@ void MarkupParser::parseTextOutsideMarkup(StringRef Text) {
     Buffer.push_back(textNode(Text));
 }
 
+// Given that a line doesn't contain any valid markup, see if it ends with the
+// start of a multi-line element. If so, returns the beginning.
+Optional<StringRef> MarkupParser::parseMultiLineBegin(StringRef Line) {
+  // A multi-line begin marker must be the last one on the line.
+  size_t BeginPos = Line.rfind("{{{");
+  if (BeginPos == StringRef::npos)
+    return None;
+  size_t BeginTagPos = BeginPos + 3;
+
+  // If there are any end markers afterwards, the begin marker cannot belong to
+  // a multi-line element.
+  size_t EndPos = Line.find("}}}", BeginTagPos);
+  if (EndPos != StringRef::npos)
+    return None;
+
+  // Check whether the tag is registered multi-line.
+  size_t EndTagPos = Line.find(':', BeginTagPos);
+  if (EndTagPos == StringRef::npos)
+    return None;
+  StringRef Tag = Line.slice(BeginTagPos, EndTagPos);
+  if (!MultilineTags.contains(Tag))
+    return None;
+  return Line.substr(BeginPos);
+}
+
+// See if the line begins with the ending of an in-progress multi-line element.
+// If so, return the ending.
+Optional<StringRef> MarkupParser::parseMultiLineEnd(StringRef Line) {
+  size_t EndPos = Line.find("}}}");
+  if (EndPos == StringRef::npos)
+    return None;
+  return Line.take_front(EndPos + 3);
+}
+
 } // end namespace symbolize
 } // end namespace llvm

diff  --git a/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
index 6d587d90b3a93..fc95840a57472 100644
--- a/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
+++ b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
@@ -44,6 +44,14 @@ TEST(SymbolizerMarkup, LinesWithoutMarkup) {
   EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("kept")));
   EXPECT_THAT(Parser.nextNode(), None);
 
+  Parser.parseLine("text\n");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("text\n")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("text\r\n");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("text\r\n")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
   Parser.parseLine("{{{");
   EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{")));
   EXPECT_THAT(Parser.nextNode(), None);
@@ -145,4 +153,69 @@ TEST(SymbolizerMarkup, LinesWithMarkup) {
   EXPECT_THAT(Parser.nextNode(), None);
 }
 
+TEST(SymbolizerMarkup, MultilineElements) {
+  MarkupParser Parser(/*MultilineTags=*/{"first", "second"});
+
+  Parser.parseLine("{{{tag:");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{first:");
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.parseLine("}}}{{{second:");
+  EXPECT_THAT(
+      Parser.nextNode(),
+      testing::Optional(isNode("{{{first:}}}", "first", ElementsAre(""))));
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.parseLine("}}}");
+  EXPECT_THAT(
+      Parser.nextNode(),
+      testing::Optional(isNode("{{{second:}}}", "second", ElementsAre(""))));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{before{{{first:");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{before")));
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.parseLine("line");
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.parseLine("}}}after");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(
+                  isNode("{{{first:line}}}", "first", ElementsAre("line"))));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("after")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{first:");
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.flush();
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{first:")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{first:\n");
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.parseLine("}}}\n");
+  EXPECT_THAT(
+      Parser.nextNode(),
+      testing::Optional(isNode("{{{first:\n}}}", "first", ElementsAre("\n"))));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\n")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{first:\r\n");
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.parseLine("}}}\r\n");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(
+                  isNode("{{{first:\r\n}}}", "first", ElementsAre("\r\n"))));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\r\n")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{first:");
+  EXPECT_THAT(Parser.nextNode(), None);
+  Parser.parseLine("\033[0m}}}");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(isNode("{{{first:\033[0m}}}", "first",
+                                       ElementsAre("\033[0m"))));
+  EXPECT_THAT(Parser.nextNode(), None);
+}
+
 } // namespace