[llvm] 8bd078b - [Symbolize] Parse multi-line markup elements.
Daniel Thornburgh via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 22 10:00:47 PDT 2022
Author: Daniel Thornburgh
Date: 2022-06-22T10:00:43-07:00
New Revision: 8bd078b57c7d308c0b6b29da9f98386a17b72b89
URL: https://github.com/llvm/llvm-project/commit/8bd078b57c7d308c0b6b29da9f98386a17b72b89
DIFF: https://github.com/llvm/llvm-project/commit/8bd078b57c7d308c0b6b29da9f98386a17b72b89.diff
LOG: [Symbolize] Parse multi-line markup elements.
This allows registering certain tags as possibly beginning multi-line
elements in the symbolizer markup parser. The parser is kept agnostic to
how lines are delimited; it reports the entire contents, including line
endings, once the end of element marker is reached.
Reviewed By: peter.smith
Differential Revision: https://reviews.llvm.org/D124798
Added:
Modified:
llvm/include/llvm/DebugInfo/Symbolize/Markup.h
llvm/lib/DebugInfo/Symbolize/Markup.cpp
llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
index 19cc0ab622f0b..86c133dd66adf 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Markup.h
@@ -21,6 +21,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Support/Regex.h"
namespace llvm {
@@ -52,7 +53,7 @@ struct MarkupNode {
/// Parses a log containing symbolizer markup into a sequence of nodes.
class MarkupParser {
public:
- MarkupParser();
+ MarkupParser(StringSet<> MultilineTags = {});
/// Parses an individual \p Line of input.
///
@@ -60,34 +61,54 @@ class MarkupParser {
/// by nextNode() are discarded. The nodes returned by nextNode() may
/// reference the input string, so it must be retained by the caller until the
/// last use.
+ ///
+ /// Note that some elements may span multiple lines. If a line ends with the
+ /// start of one of these elements, then no nodes will be produced until
+ /// either the end or something that cannot be part of an element is
+ /// encountered. This may only occur after multiple calls to parseLine(),
+ /// corresponding to the lines of the multi-line element.
void parseLine(StringRef Line);
- /// Returns the next node from the most recent parseLine() call.
+ /// Inform the parser that the input stream has ended.
+ ///
+ /// This allows the parser to finish any deferred processing (e.g., an
+ /// in-progress multi-line element) and may cause nextNode() to return
+ /// additional nodes.
+ void flush();
+
+ /// Returns the next node in the input sequence.
///
/// Calling nextNode() may invalidate the contents of the node returned by the
/// previous call.
///
/// \returns the next markup node or None if none remain.
- Optional<MarkupNode> nextNode() {
- if (!NextIdx)
- NextIdx = 0;
- if (*NextIdx == Buffer.size()) {
- NextIdx.reset();
- Buffer.clear();
- return None;
- }
- return std::move(Buffer[(*NextIdx)++]);
- }
+ Optional<MarkupNode> nextNode();
private:
Optional<MarkupNode> parseElement(StringRef Line);
void parseTextOutsideMarkup(StringRef Text);
+ Optional<StringRef> parseMultiLineBegin(StringRef Line);
+ Optional<StringRef> parseMultiLineEnd(StringRef Line);
+
+ // Tags of elements that can span multiple lines.
+ const StringSet<> MultilineTags;
+
+ // Contents of a multi-line element that has finished being parsed. Retained
+ // to keep returned StringRefs for the contents valid.
+ std::string FinishedMultiline;
+
+ // Contents of a multi-line element that is still in the process of receiving
+ // lines.
+ std::string InProgressMultiline;
+
+ // The line currently being parsed.
+ StringRef Line;
// Buffer for nodes parsed from the current line.
SmallVector<MarkupNode> Buffer;
- // Next buffer index to return or None if nextNode has not yet been called.
- Optional<size_t> NextIdx;
+ // Next buffer index to return.
+ size_t NextIdx;
// Regular expression matching supported ANSI SGR escape sequences.
const Regex SGRSyntax;
diff --git a/llvm/lib/DebugInfo/Symbolize/Markup.cpp b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
index 04cf7b38b7997..9bc65e763287f 100644
--- a/llvm/lib/DebugInfo/Symbolize/Markup.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
@@ -13,6 +13,7 @@
#include "llvm/DebugInfo/Symbolize/Markup.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
namespace llvm {
@@ -24,7 +25,8 @@ namespace symbolize {
// "\033[30m" -- "\033[37m"
static const char SGRSyntaxStr[] = "\033\\[([0-1]|3[0-7])m";
-MarkupParser::MarkupParser() : SGRSyntax(SGRSyntaxStr) {}
+MarkupParser::MarkupParser(StringSet<> MultilineTags)
+ : MultilineTags(std::move(MultilineTags)), SGRSyntax(SGRSyntaxStr) {}
static StringRef takeTo(StringRef Str, StringRef::iterator Pos) {
return Str.take_front(Pos - Str.begin());
@@ -35,18 +37,73 @@ static void advanceTo(StringRef &Str, StringRef::iterator Pos) {
void MarkupParser::parseLine(StringRef Line) {
Buffer.clear();
- while (!Line.empty()) {
- // Find the first valid markup element, if any.
- if (Optional<MarkupNode> Element = parseElement(Line)) {
- parseTextOutsideMarkup(takeTo(Line, Element->Text.begin()));
- Buffer.push_back(std::move(*Element));
- advanceTo(Line, Element->Text.end());
- } else {
- // The line doesn't contain any more markup elements, so emit it as text.
- parseTextOutsideMarkup(Line);
- return;
+ NextIdx = 0;
+ FinishedMultiline.clear();
+ this->Line = Line;
+}
+
+Optional<MarkupNode> MarkupParser::nextNode() {
+ // Pull something out of the buffer if possible.
+ if (!Buffer.empty()) {
+ if (NextIdx < Buffer.size())
+ return std::move(Buffer[NextIdx++]);
+ NextIdx = 0;
+ Buffer.clear();
+ }
+
+ // The buffer is empty, so parse the next bit of the line.
+
+ if (Line.empty())
+ return None;
+
+ if (!InProgressMultiline.empty()) {
+ if (Optional<StringRef> MultilineEnd = parseMultiLineEnd(Line)) {
+ llvm::append_range(InProgressMultiline, *MultilineEnd);
+ assert(FinishedMultiline.empty() &&
+ "At most one multi-line element can be finished at a time.");
+ FinishedMultiline.swap(InProgressMultiline);
+ // Parse the multi-line element as if it were contiguous.
+ advanceTo(Line, MultilineEnd->end());
+ return *parseElement(FinishedMultiline);
}
+
+ // The whole line is part of the multi-line element.
+ llvm::append_range(InProgressMultiline, Line);
+ Line = Line.drop_front(Line.size());
+ return None;
+ }
+
+ // Find the first valid markup element, if any.
+ if (Optional<MarkupNode> Element = parseElement(Line)) {
+ parseTextOutsideMarkup(takeTo(Line, Element->Text.begin()));
+ Buffer.push_back(std::move(*Element));
+ advanceTo(Line, Element->Text.end());
+ return nextNode();
+ }
+
+ // Since there were no valid elements remaining, see if the line opens a
+ // multi-line element.
+ if (Optional<StringRef> MultilineBegin = parseMultiLineBegin(Line)) {
+ // Emit any text before the element.
+ parseTextOutsideMarkup(takeTo(Line, MultilineBegin->begin()));
+
+ // Begin recording the multi-line element.
+ llvm::append_range(InProgressMultiline, *MultilineBegin);
+ Line = Line.drop_front(Line.size());
+ return nextNode();
}
+
+ // The line doesn't contain any more markup elements, so emit it as text.
+ parseTextOutsideMarkup(Line);
+ Line = Line.drop_front(Line.size());
+ return nextNode();
+}
+
+void MarkupParser::flush() {
+ if (InProgressMultiline.empty())
+ return;
+ FinishedMultiline.swap(InProgressMultiline);
+ parseTextOutsideMarkup(FinishedMultiline);
}
// Finds and returns the next valid markup element in the given line. Returns
@@ -107,5 +164,39 @@ void MarkupParser::parseTextOutsideMarkup(StringRef Text) {
Buffer.push_back(textNode(Text));
}
+// Given that a line doesn't contain any valid markup, see if it ends with the
+// start of a multi-line element. If so, returns the beginning.
+Optional<StringRef> MarkupParser::parseMultiLineBegin(StringRef Line) {
+ // A multi-line begin marker must be the last one on the line.
+ size_t BeginPos = Line.rfind("{{{");
+ if (BeginPos == StringRef::npos)
+ return None;
+ size_t BeginTagPos = BeginPos + 3;
+
+ // If there are any end markers afterwards, the begin marker cannot belong to
+ // a multi-line element.
+ size_t EndPos = Line.find("}}}", BeginTagPos);
+ if (EndPos != StringRef::npos)
+ return None;
+
+ // Check whether the tag is registered multi-line.
+ size_t EndTagPos = Line.find(':', BeginTagPos);
+ if (EndTagPos == StringRef::npos)
+ return None;
+ StringRef Tag = Line.slice(BeginTagPos, EndTagPos);
+ if (!MultilineTags.contains(Tag))
+ return None;
+ return Line.substr(BeginPos);
+}
+
+// See if the line begins with the ending of an in-progress multi-line element.
+// If so, return the ending.
+Optional<StringRef> MarkupParser::parseMultiLineEnd(StringRef Line) {
+ size_t EndPos = Line.find("}}}");
+ if (EndPos == StringRef::npos)
+ return None;
+ return Line.take_front(EndPos + 3);
+}
+
} // end namespace symbolize
} // end namespace llvm
diff --git a/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
index 6d587d90b3a93..fc95840a57472 100644
--- a/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
+++ b/llvm/unittests/DebugInfo/Symbolizer/MarkupTest.cpp
@@ -44,6 +44,14 @@ TEST(SymbolizerMarkup, LinesWithoutMarkup) {
EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("kept")));
EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("text\n");
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("text\n")));
+ EXPECT_THAT(Parser.nextNode(), None);
+
+ Parser.parseLine("text\r\n");
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("text\r\n")));
+ EXPECT_THAT(Parser.nextNode(), None);
+
Parser.parseLine("{{{");
EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{")));
EXPECT_THAT(Parser.nextNode(), None);
@@ -145,4 +153,69 @@ TEST(SymbolizerMarkup, LinesWithMarkup) {
EXPECT_THAT(Parser.nextNode(), None);
}
+TEST(SymbolizerMarkup, MultilineElements) {
+ MarkupParser Parser(/*MultilineTags=*/{"first", "second"});
+
+ Parser.parseLine("{{{tag:");
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{tag:")));
+ EXPECT_THAT(Parser.nextNode(), None);
+
+ Parser.parseLine("{{{first:");
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("}}}{{{second:");
+ EXPECT_THAT(
+ Parser.nextNode(),
+ testing::Optional(isNode("{{{first:}}}", "first", ElementsAre(""))));
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("}}}");
+ EXPECT_THAT(
+ Parser.nextNode(),
+ testing::Optional(isNode("{{{second:}}}", "second", ElementsAre(""))));
+ EXPECT_THAT(Parser.nextNode(), None);
+
+ Parser.parseLine("{{{before{{{first:");
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{before")));
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("line");
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("}}}after");
+ EXPECT_THAT(Parser.nextNode(),
+ testing::Optional(
+ isNode("{{{first:line}}}", "first", ElementsAre("line"))));
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("after")));
+ EXPECT_THAT(Parser.nextNode(), None);
+
+ Parser.parseLine("{{{first:");
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.flush();
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("{{{first:")));
+ EXPECT_THAT(Parser.nextNode(), None);
+
+ Parser.parseLine("{{{first:\n");
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("}}}\n");
+ EXPECT_THAT(
+ Parser.nextNode(),
+ testing::Optional(isNode("{{{first:\n}}}", "first", ElementsAre("\n"))));
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\n")));
+ EXPECT_THAT(Parser.nextNode(), None);
+
+ Parser.parseLine("{{{first:\r\n");
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("}}}\r\n");
+ EXPECT_THAT(Parser.nextNode(),
+ testing::Optional(
+ isNode("{{{first:\r\n}}}", "first", ElementsAre("\r\n"))));
+ EXPECT_THAT(Parser.nextNode(), testing::Optional(isNode("\r\n")));
+ EXPECT_THAT(Parser.nextNode(), None);
+
+ Parser.parseLine("{{{first:");
+ EXPECT_THAT(Parser.nextNode(), None);
+ Parser.parseLine("\033[0m}}}");
+ EXPECT_THAT(Parser.nextNode(),
+ testing::Optional(isNode("{{{first:\033[0m}}}", "first",
+ ElementsAre("\033[0m"))));
+ EXPECT_THAT(Parser.nextNode(), None);
+}
+
} // namespace
More information about the llvm-commits
mailing list