[llvm] 2040b6d - [Symbolize] Parser for log symbolizer markup.

Fri Jun 17 10:26:29 PDT 2022

Author: Daniel Thornburgh
Date: 2022-06-17T10:26:24-07:00
New Revision: 2040b6df0a3f355076a363459f0d6c5ef187ac81

URL: https://github.com/llvm/llvm-project/commit/2040b6df0a3f355076a363459f0d6c5ef187ac81
DIFF: https://github.com/llvm/llvm-project/commit/2040b6df0a3f355076a363459f0d6c5ef187ac81.diff

LOG: [Symbolize] Parser for log symbolizer markup.

This adds a parser for the log symbolizer markup format discussed in
https://discourse.llvm.org/t/rfc-log-symbolizer/61282. The parser
operates in a line-by-line fashion with minimal memory requirements.

This doesn't yet include support for multi-line tags or specific parsing
for ANSI X3.64 SGR control sequences, but it can be extended to do so.
The latter can also be relatively easily handled by examining the
resulting text elements.

Reviewed By: peter.smith

Differential Revision: https://reviews.llvm.org/D124686

Added: 
    llvm/include/llvm/DebugInfo/Symbolize/Markup.h
    llvm/lib/DebugInfo/Symbolize/Markup.cpp
    llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt
    llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp

Modified: 
    llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
    llvm/unittests/DebugInfo/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
new file mode 100644
index 0000000000000..19cc0ab622f0b

--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
@@ -0,0 +1,99 @@
+//===- Markup.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the log symbolizer markup data model and parser.
+///
+/// \todo Add a link to the reference documentation once added.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+#define LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
+
+#include <iostream>
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Regex.h"
+
+namespace llvm {
+namespace symbolize {
+
+/// One node of parsed symbolizer markup.
+///
+/// A node with only the Text field set stands for a region of text that lies
+/// outside any markup element. ANSI SGR control codes are reported the same
+/// way: a detected control code makes up the whole Text of its own node, and
+/// the text around it is reported in the preceding and following nodes.
+struct MarkupNode {
+  /// The exact span of the input that this node covers.
+  StringRef Text;
+
+  /// The tag, when this node is an element; empty otherwise.
+  StringRef Tag;
+
+  /// The contents of each field, when this node is an element with fields;
+  /// empty otherwise.
+  SmallVector<StringRef> Fields;
+
+  /// Two nodes are equal when their text, tag and fields all compare equal.
+  bool operator==(const MarkupNode &Other) const {
+    if (Text != Other.Text)
+      return false;
+    if (Tag != Other.Tag)
+      return false;
+    return Fields == Other.Fields;
+  }
+  bool operator!=(const MarkupNode &Other) const { return !operator==(Other); }
+};
+
+/// Line-oriented parser that turns a log containing symbolizer markup into a
+/// sequence of MarkupNode values.
+class MarkupParser {
+public:
+  MarkupParser();
+
+  /// Parses a single \p Line of input.
+  ///
+  /// Any nodes left over from the previous parseLine() call that were not yet
+  /// handed out by nextNode() are dropped. Nodes handed out by nextNode() may
+  /// point into \p Line, so the caller has to keep that string alive until
+  /// the nodes are no longer used.
+  void parseLine(StringRef Line);
+
+  /// Hands out the next node of the line most recently given to parseLine().
+  ///
+  /// A call to nextNode() may invalidate the contents of the node returned by
+  /// the call before it.
+  ///
+  /// \returns the next markup node, or None once every node has been handed
+  /// out.
+  Optional<MarkupNode> nextNode() {
+    // An unset index means extraction starts at the front of the buffer.
+    const size_t Idx = NextIdx ? *NextIdx : 0;
+    if (Idx == Buffer.size()) {
+      // Everything has been handed out; return to the initial state.
+      Buffer.clear();
+      NextIdx.reset();
+      return None;
+    }
+    NextIdx = Idx + 1;
+    return std::move(Buffer[Idx]);
+  }
+
+private:
+  Optional<MarkupNode> parseElement(StringRef Line);
+  void parseTextOutsideMarkup(StringRef Text);
+
+  // Nodes parsed from the current line, in input order.
+  SmallVector<MarkupNode> Buffer;
+
+  // Index into Buffer of the next node to return, or None if nextNode has not
+  // yet been called.
+  Optional<size_t> NextIdx;
+
+  // Regular expression that recognizes the supported ANSI SGR escape
+  // sequences.
+  const Regex SGRSyntax;
+};
+
+} // end namespace symbolize
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H

diff  --git a/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt b/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
index a647a161579ae..c83d957eeb9d5 100644
--- a/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/Symbolize/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMSymbolize
   DIFetcher.cpp
   DIPrinter.cpp
+  Markup.cpp
   SymbolizableObjectFile.cpp
   Symbolize.cpp
 

diff  --git a/llvm/lib/DebugInfo/Symbolize/Markup.cpp b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
new file mode 100644
index 0000000000000..04cf7b38b7997
--- /dev/null
+++ b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
@@ -0,0 +1,111 @@
+//===- lib/DebugInfo/Symbolize/Markup.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the log symbolizer markup data model and parser.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/Markup.h"
+
+#include "llvm/ADT/StringExtras.h"
+
+namespace llvm {
+namespace symbolize {
+
+// Matches the following:
+//   "\033[0m"
+//   "\033[1m"
+//   "\033[30m" -- "\033[37m"
+// The single capture group holds the numeric parameter that sits between the
+// "\033[" introducer and the terminating 'm'.
+static const char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m";
+
+// Initializes the SGRSyntax regular expression from SGRSyntaxStr.
+MarkupParser::MarkupParser() : SGRSyntax(SGRSyntaxStr) {}
+
+// Returns the leading part of Str that ends immediately before Pos. Pos is
+// expected to be an iterator into Str.
+static StringRef takeTo(StringRef Str, StringRef::iterator Pos) {
+  const size_t PrefixLen = Pos - Str.begin();
+  return Str.take_front(PrefixLen);
+}
+// Shrinks Str in place so that it begins at Pos. Pos is expected to be an
+// iterator into Str.
+static void advanceTo(StringRef &Str, StringRef::iterator Pos) {
+  const size_t Consumed = Pos - Str.begin();
+  Str = Str.drop_front(Consumed);
+}
+
+// Parses Line into Buffer as an ordered mix of markup elements and the text
+// (including SGR control codes) found between them. Nodes still pending from
+// an earlier call are discarded, and extraction restarts at the first node of
+// this line.
+void MarkupParser::parseLine(StringRef Line) {
+  Buffer.clear();
+  // Forget how far the previous line was consumed. Without this, a caller
+  // that stopped calling nextNode() before it returned None would resume at a
+  // stale index, skipping nodes of this line or reading past the end of
+  // Buffer.
+  NextIdx.reset();
+  while (!Line.empty()) {
+    // Find the first valid markup element, if any.
+    Optional<MarkupNode> Element = parseElement(Line);
+    if (!Element) {
+      // The line doesn't contain any more markup elements, so emit it as text.
+      parseTextOutsideMarkup(Line);
+      return;
+    }
+    // Record the element's extent first so that nothing is read from the node
+    // after it has been moved into Buffer.
+    const StringRef ElementText = Element->Text;
+    parseTextOutsideMarkup(takeTo(Line, ElementText.begin()));
+    Buffer.push_back(std::move(*Element));
+    advanceTo(Line, ElementText.end());
+  }
+}
+
+// Scans Line for the first "{{{...}}}" span whose tag is non-empty and returns
+// it as an element node. Spans with an empty tag are skipped and the scan
+// resumes right after them. Returns None when no valid element remains.
+Optional<MarkupNode> MarkupParser::parseElement(StringRef Line) {
+  for (;;) {
+    // Locate the opening marker and the first closing marker after it.
+    const size_t OpenPos = Line.find("{{{");
+    if (OpenPos == StringRef::npos)
+      return None;
+    const size_t ClosePos = Line.find("}}}", OpenPos + 3);
+    if (ClosePos == StringRef::npos)
+      return None;
+    const size_t PastClose = ClosePos + 3;
+
+    MarkupNode Element;
+    Element.Text = Line.slice(OpenPos, PastClose);
+    // Continue any later search after this candidate.
+    Line = Line.substr(PastClose);
+
+    // Everything between the markers: the tag, optionally followed by ':' and
+    // the fields.
+    StringRef Content = Element.Text.drop_front(3).drop_back(3);
+    auto TagAndFields = Content.split(':');
+    Element.Tag = TagAndFields.first;
+    if (Element.Tag.empty())
+      continue;
+
+    const StringRef FieldsContent = TagAndFields.second;
+    if (FieldsContent.empty()) {
+      // "tag:" carries exactly one empty field; a bare "tag" carries none.
+      if (Content.back() == ':')
+        Element.Fields.push_back(FieldsContent);
+    } else {
+      FieldsContent.split(Element.Fields, ":");
+    }
+
+    return Element;
+  }
+}
+
+// Builds a node that carries only Text; its tag and fields stay empty.
+static MarkupNode textNode(StringRef Text) {
+  MarkupNode PlainText;
+  PlainText.Text = Text;
+  return PlainText;
+}
+
+// Appends nodes for a stretch of text that is known to lie outside every
+// markup element. The text can still hold SGR control codes, so it is split
+// further: each control code becomes a node of its own, and the plain text
+// around it becomes separate nodes.
+void MarkupParser::parseTextOutsideMarkup(StringRef Text) {
+  if (Text.empty())
+    return;
+  SmallVector<StringRef> Matches;
+  while (SGRSyntax.match(Text, &Matches)) {
+    // The first match entry is the whole control sequence.
+    const StringRef ControlCode = Matches.front();
+
+    // Plain text in front of the control code, if there is any.
+    const StringRef Preceding = takeTo(Text, ControlCode.begin());
+    if (!Preceding.empty())
+      Buffer.push_back(textNode(Preceding));
+
+    Buffer.push_back(textNode(ControlCode));
+    advanceTo(Text, ControlCode.end());
+  }
+  // Whatever follows the last control code is plain text.
+  if (!Text.empty())
+    Buffer.push_back(textNode(Text));
+}
+
+} // end namespace symbolize
+} // end namespace llvm

diff  --git a/llvm/unittests/DebugInfo/CMakeLists.txt b/llvm/unittests/DebugInfo/CMakeLists.txt
index 0a0a1141d92e2..4be8d76473c2e 100644
--- a/llvm/unittests/DebugInfo/CMakeLists.txt
+++ b/llvm/unittests/DebugInfo/CMakeLists.txt
@@ -3,3 +3,4 @@ add_subdirectory(DWARF)
 add_subdirectory(GSYM)
 add_subdirectory(MSF)
 add_subdirectory(PDB)
+add_subdirectory(Symbolizer)

diff  --git a/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt b/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt
new file mode 100644
index 0000000000000..e6c2ba17f4e1d
--- /dev/null
+++ b/llvm/unittests/DebugInfo/Symbolizer/CMakeLists.txt
@@ -0,0 +1,3 @@
+# The markup parser under test lives in the Symbolize component.
+set(LLVM_LINK_COMPONENTS Symbolize)
+# Unit test binary built from MarkupTest.cpp.
+add_llvm_unittest(DebugInfoSymbolizerTests MarkupTest.cpp)
+target_link_libraries(DebugInfoSymbolizerTests PRIVATE LLVMTestingSupport)

diff  --git a/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
new file mode 100644
index 0000000000000..6d587d90b3a93
--- /dev/null
+++ b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
@@ -0,0 +1,148 @@
+
+//===- unittest/DebugInfo/Symbolizer/MarkupTest.cpp - Markup parser tests -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/Symbolize/Markup.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using namespace llvm;
+using namespace llvm::symbolize;
+using namespace testing;
+
+// Builds a matcher for a MarkupNode whose Text and Tag equal the given values
+// and whose Fields satisfy the given matcher. By default the node must be a
+// pure text node: empty tag and no fields.
+Matcher<MarkupNode> isNode(StringRef Text, StringRef Tag = "",
+                           Matcher<SmallVector<StringRef>> Fields = IsEmpty()) {
+  auto TextMatches = Field("Text", &MarkupNode::Text, Text);
+  auto TagMatches = Field("Tag", &MarkupNode::Tag, Tag);
+  auto FieldsMatch = Field("Fields", &MarkupNode::Fields, Fields);
+  return AllOf(TextMatches, TagMatches, FieldsMatch);
+}
+
+// A parser that has not been given any line yields no nodes.
+TEST(SymbolizerMarkup, NoLines) { EXPECT_EQ(MarkupParser{}.nextNode(), None); }
+
+// Lines that contain no valid markup element come back as a single text node.
+TEST(SymbolizerMarkup, LinesWithoutMarkup) {
+  MarkupParser Parser;
+
+  // Plain text.
+  Parser.parseLine("text");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("text")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Nodes of a line that was never extracted are dropped by the next
+  // parseLine() call.
+  Parser.parseLine("discarded");
+  Parser.parseLine("kept");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("kept")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Missing or incomplete begin/end markers.
+  Parser.parseLine("{{{");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{}}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Well-formed markers around an empty tag are not an element.
+  Parser.parseLine("{{{}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{}}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{:field}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{:field}}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // A tag is present but the end marker is missing or too short.
+  Parser.parseLine("{{{tag:");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{tag:field}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:field}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Escape sequences outside the supported SGR set (0, 1, 30-37) stay embedded
+  // in the surrounding text.
+  Parser.parseLine("a\033[2mb");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a\033[2mb")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("a\033[38mb");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a\033[38mb")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("a\033[4mb");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a\033[4mb")));
+  EXPECT_THAT(Parser.nextNode(), None);
+}
+
+// Lines that contain markup elements or supported SGR control codes are split
+// into element, control-code and text nodes in input order.
+TEST(SymbolizerMarkup, LinesWithMarkup) {
+  MarkupParser Parser;
+
+  // Tag only, no fields.
+  Parser.parseLine("{{{tag}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Colon-separated fields.
+  Parser.parseLine("{{{tag:f1:f2:f3}}}");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(isNode("{{{tag:f1:f2:f3}}}", "tag",
+                                       ElementsAre("f1", "f2", "f3"))));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // A trailing colon yields one empty field.
+  Parser.parseLine("{{{tag:}}}");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(isNode("{{{tag:}}}", "tag", ElementsAre(""))));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // The end marker is one brace short, so this remains plain text.
+  Parser.parseLine("{{{tag:}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:}}")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Digits and upper-case letters are accepted in tags.
+  Parser.parseLine("{{{t2g}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{t2g}}}", "t2g")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  Parser.parseLine("{{{tAg}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tAg}}}", "tAg")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Elements interleaved with text.
+  Parser.parseLine("a{{{b}}}c{{{d}}}e");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("a")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{b}}}", "b")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("c")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{d}}}", "d")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("e")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // An element with an empty tag is reported as text; parsing then continues
+  // with the valid element that follows it.
+  Parser.parseLine("{{{}}}{{{tag}}}");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{}}}")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag}}}", "tag")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // Each supported SGR control code becomes its own text node.
+  Parser.parseLine("\033[0mA\033[1mB\033[30mC\033[37m");
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[0m")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("A")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[1m")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("B")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[30m")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("C")));
+  EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\033[37m")));
+  EXPECT_THAT(Parser.nextNode(), None);
+
+  // An SGR control code inside an element is left untouched as field content.
+  Parser.parseLine("{{{tag:\033[0m}}}");
+  EXPECT_THAT(Parser.nextNode(),
+              testing::Optional(
+                  isNode("{{{tag:\033[0m}}}", "tag", ElementsAre("\033[0m"))));
+  EXPECT_THAT(Parser.nextNode(), None);
+}
+
+} // namespace