[clang-tools-extra] 704cd4d - [clangd] Only minimally escape text when rendering to markdown.
Sam McCall via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 17 09:10:37 PDT 2020
Author: Sam McCall
Date: 2020-03-17T17:10:20+01:00
New Revision: 704cd4d5d0754904361823588f203369c309deca
URL: https://github.com/llvm/llvm-project/commit/704cd4d5d0754904361823588f203369c309deca
DIFF: https://github.com/llvm/llvm-project/commit/704cd4d5d0754904361823588f203369c309deca.diff
LOG: [clangd] Only minimally escape text when rendering to markdown.
Summary:
Conservatively escaping everything is bad in coc.nvim which shows the markdown
to the user, and we have reports of it causing problems for other parsers.
Fixes https://github.com/clangd/clangd/issues/301
Reviewers: kadircet
Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D75687
Added:
Modified:
clang-tools-extra/clangd/FormattedString.cpp
clang-tools-extra/clangd/FormattedString.h
clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
clang-tools-extra/clangd/unittests/HoverTests.cpp
Removed:
################################################################################
diff --git a/clang-tools-extra/clangd/FormattedString.cpp b/clang-tools-extra/clangd/FormattedString.cpp
index 9cb732eabcf8..ecbc060a870a 100644
--- a/clang-tools-extra/clangd/FormattedString.cpp
+++ b/clang-tools-extra/clangd/FormattedString.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,23 +27,143 @@ namespace clangd {
namespace markup {
namespace {
+
+// Is <contents a plausible start to an HTML tag?
+// Contents may not be the rest of the line, but it's the rest of the plain
+// text, so we expect to see at least the tag name.
+bool looksLikeTag(llvm::StringRef Contents) {
+ if (Contents.empty())
+ return false;
+ if (Contents.front() == '!' || Contents.front() == '?' ||
+ Contents.front() == '/')
+ return true;
+ // Check the start of the tag name.
+ if (!llvm::isAlpha(Contents.front()))
+ return false;
+ // Drop rest of the tag name, and following whitespace.
+ Contents = Contents
+ .drop_while([](char C) {
+ return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':';
+ })
+ .drop_while(isWhitespace);
+ // The rest of the tag consists of attributes, which have restrictive names.
+ // If we hit '=', all bets are off (attribute values can contain anything).
+ for (; !Contents.empty(); Contents = Contents.drop_front()) {
+ if (llvm::isAlnum(Contents.front()) || isWhitespace(Contents.front()))
+ continue;
+ if (Contents.front() == '>' || Contents.startswith("/>"))
+ return true; // May close the tag.
+ if (Contents.front() == '=')
+ return true; // Don't try to parse attribute values.
+ return false; // Random punctuation means this isn't a tag.
+ }
+ return true; // Potentially incomplete tag.
+}
+
+// Tests whether C should be backslash-escaped in markdown.
+// The string being escaped is Before + C + After. This is part of a paragraph.
+// StartsLine indicates whether `Before` is the start of the line.
+// After may not be everything until the end of the line.
+//
+// It's always safe to escape punctuation, but want minimal escaping.
+// The strategy is to escape the first character of anything that might start
+// a markdown grammar construct.
+bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
+ bool StartsLine) {
+ assert(Before.take_while(isWhitespace).empty());
+ auto RulerLength = [&]() -> /*Length*/ unsigned {
+ if (!StartsLine || !Before.empty())
+ return false;
+ llvm::StringRef A = After.rtrim();
+ return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0;
+ };
+ auto IsBullet = [&]() {
+ return StartsLine && Before.empty() &&
+ (After.empty() || After.startswith(" "));
+ };
+ auto SpaceSurrounds = [&]() {
+ return (After.empty() || isWhitespace(After.front())) &&
+ (Before.empty() || isWhitespace(Before.back()));
+ };
+ auto WordSurrounds = [&]() {
+ return (!After.empty() && llvm::isAlnum(After.front())) &&
+ (!Before.empty() && llvm::isAlnum(Before.back()));
+ };
+
+ switch (C) {
+ case '\\': // Escaped character.
+ return true;
+ case '`': // Code block or inline code
+ // Any number of backticks can delimit an inline code block that can end
+ // anywhere (including on another line). We must escape them all.
+ return true;
+ case '~': // Code block
+ return StartsLine && Before.empty() && After.startswith("~~");
+ case '#': { // ATX heading.
+ if (!StartsLine || !Before.empty())
+ return false;
+ llvm::StringRef Rest = After.ltrim(C);
+ return Rest.empty() || Rest.startswith(" ");
+ }
+ case ']': // Link or link reference.
+ // We escape ] rather than [ here, because it's more constrained:
+ // ](...) is an in-line link
+ // ]: is a link reference
+ // The following are only links if the link reference exists:
+ // ] by itself is a shortcut link
+ // ][...] is an out-of-line link
+ // Because we never emit link references, we don't need to handle these.
+ return After.startswith(":") || After.startswith("(");
+ case '=': // Setex heading.
+ return RulerLength() > 0;
+ case '_': // Horizontal ruler or matched delimiter.
+ if (RulerLength() >= 3)
+ return true;
+ // Not a delimiter if surrounded by space, or inside a word.
+ // (The rules at word boundaries are subtle).
+ return !(SpaceSurrounds() || WordSurrounds());
+ case '-': // Setex heading, horizontal ruler, or bullet.
+ if (RulerLength() > 0)
+ return true;
+ return IsBullet();
+ case '+': // Bullet list.
+ return IsBullet();
+ case '*': // Bullet list, horizontal ruler, or delimiter.
+ return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
+ case '<': // HTML tag (or autolink, which we choose not to escape)
+ return looksLikeTag(After);
+ case '>': // Quote marker. Needs escaping at start of line.
+ return StartsLine && Before.empty();
+ case '&': { // HTML entity reference
+ auto End = After.find(';');
+ if (End == llvm::StringRef::npos)
+ return false;
+ llvm::StringRef Content = After.substr(0, End);
+ if (Content.consume_front("#")) {
+ if (Content.consume_front("x") || Content.consume_front("X"))
+ return llvm::all_of(Content, llvm::isHexDigit);
+ return llvm::all_of(Content, llvm::isDigit);
+ }
+ return llvm::all_of(Content, llvm::isAlpha);
+ }
+ case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
+ case ')':
+ return StartsLine && !Before.empty() &&
+ llvm::all_of(Before, llvm::isDigit) && After.startswith(" ");
+ default:
+ return false;
+ }
+}
+
/// Escape a markdown text block. Ensures the punctuation will not introduce
/// any of the markdown constructs.
-std::string renderText(llvm::StringRef Input) {
- // Escaping ASCII punctuation ensures we can't start a markdown construct.
- constexpr llvm::StringLiteral Punctuation =
- R"txt(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)txt";
-
+std::string renderText(llvm::StringRef Input, bool StartsLine) {
std::string R;
- for (size_t From = 0; From < Input.size();) {
- size_t Next = Input.find_first_of(Punctuation, From);
- R += Input.substr(From, Next - From);
- if (Next == llvm::StringRef::npos)
- break;
- R += "\\";
- R += Input[Next];
-
- From = Next + 1;
+ for (unsigned I = 0; I < Input.size(); ++I) {
+ if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1),
+ StartsLine))
+ R.push_back('\\');
+ R.push_back(Input[I]);
}
return R;
}
@@ -236,7 +357,7 @@ void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
OS << Sep;
switch (C.Kind) {
case Chunk::PlainText:
- OS << renderText(C.Contents);
+ OS << renderText(C.Contents, Sep.empty());
break;
case Chunk::InlineCode:
OS << renderInlineBlock(C.Contents);
diff --git a/clang-tools-extra/clangd/FormattedString.h b/clang-tools-extra/clangd/FormattedString.h
index effd03725925..fe864ba57568 100644
--- a/clang-tools-extra/clangd/FormattedString.h
+++ b/clang-tools-extra/clangd/FormattedString.h
@@ -95,6 +95,8 @@ class Document {
BulletList &addBulletList();
/// Doesn't contain any trailing newlines.
+ /// We try to make the markdown human-readable, e.g. avoid extra escaping.
+ /// At least one client (coc.nvim) displays the markdown verbatim!
std::string asMarkdown() const;
/// Doesn't contain any trailing newlines.
std::string asPlainText() const;
diff --git a/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp b/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
index 6489aa05f844..d3fa1aa8c0a7 100644
--- a/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
@@ -17,25 +17,96 @@ namespace clangd {
namespace markup {
namespace {
-TEST(Render, Escaping) {
- // Check some ASCII punctuation
- Paragraph P;
- P.appendText("*!`");
- EXPECT_EQ(P.asMarkdown(), "\\*\\!\\`");
+std::string escape(llvm::StringRef Text) {
+ return Paragraph().appendText(Text.str()).asMarkdown();
+}
+
+MATCHER_P(escaped, C, "") {
+ return testing::ExplainMatchResult(::testing::HasSubstr(std::string{'\\', C}),
+ arg, result_listener);
+}
+MATCHER(escapedNone, "") {
+ return testing::ExplainMatchResult(::testing::Not(::testing::HasSubstr("\\")),
+ arg, result_listener);
+}
+
+TEST(Render, Escaping) {
// Check all ASCII punctuation.
- P = Paragraph();
std::string Punctuation = R"txt(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)txt";
- // Same text, with each character escaped.
- std::string EscapedPunctuation;
- EscapedPunctuation.reserve(2 * Punctuation.size());
- for (char C : Punctuation)
- EscapedPunctuation += std::string("\\") + C;
- P.appendText(Punctuation);
- EXPECT_EQ(P.asMarkdown(), EscapedPunctuation);
+ std::string EscapedPunc = R"txt(!"#$%&'()\*+,-./:;<=>?@[\\]^\_\`{|}~)txt";
+ EXPECT_EQ(escape(Punctuation), EscapedPunc);
+
+ // Inline code
+ EXPECT_EQ(escape("`foo`"), R"(\`foo\`)");
+ EXPECT_EQ(escape("`foo"), R"(\`foo)");
+ EXPECT_EQ(escape("foo`"), R"(foo\`)");
+ EXPECT_EQ(escape("``foo``"), R"(\`\`foo\`\`)");
+ // Code blocks
+ EXPECT_EQ(escape("```"), R"(\`\`\`)"); // This could also be inline code!
+ EXPECT_EQ(escape("~~~"), R"(\~~~)");
+
+ // Rulers and headings
+ EXPECT_THAT(escape("## Heading"), escaped('#'));
+ EXPECT_THAT(escape("Foo # bar"), escapedNone());
+ EXPECT_EQ(escape("---"), R"(\---)");
+ EXPECT_EQ(escape("-"), R"(\-)");
+ EXPECT_EQ(escape("==="), R"(\===)");
+ EXPECT_EQ(escape("="), R"(\=)");
+ EXPECT_EQ(escape("***"), R"(\*\*\*)"); // \** could start emphasis!
+
+ // HTML tags.
+ EXPECT_THAT(escape("<pre"), escaped('<'));
+ EXPECT_THAT(escape("< pre"), escapedNone());
+ EXPECT_THAT(escape("if a<b then"), escaped('<'));
+ EXPECT_THAT(escape("if a<b then c."), escapedNone());
+ EXPECT_THAT(escape("if a<b then c='foo'."), escaped('<'));
+ EXPECT_THAT(escape("std::vector<T>"), escaped('<'));
+ EXPECT_THAT(escape("std::vector<std::string>"), escaped('<'));
+ EXPECT_THAT(escape("std::map<int, int>"), escapedNone());
+ // Autolinks
+ EXPECT_THAT(escape("Email <foo at bar.com>"), escapedNone());
+ EXPECT_THAT(escape("Website <http://foo.bar>"), escapedNone());
+
+ // Bullet lists.
+ EXPECT_THAT(escape("- foo"), escaped('-'));
+ EXPECT_THAT(escape("* foo"), escaped('*'));
+ EXPECT_THAT(escape("+ foo"), escaped('+'));
+ EXPECT_THAT(escape("+"), escaped('+'));
+ EXPECT_THAT(escape("a + foo"), escapedNone());
+ EXPECT_THAT(escape("a+ foo"), escapedNone());
+ EXPECT_THAT(escape("1. foo"), escaped('.'));
+ EXPECT_THAT(escape("a. foo"), escapedNone());
+
+ // Emphasis.
+ EXPECT_EQ(escape("*foo*"), R"(\*foo\*)");
+ EXPECT_EQ(escape("**foo**"), R"(\*\*foo\*\*)");
+ EXPECT_THAT(escape("*foo"), escaped('*'));
+ EXPECT_THAT(escape("foo *"), escapedNone());
+ EXPECT_THAT(escape("foo * bar"), escapedNone());
+ EXPECT_THAT(escape("foo_bar"), escapedNone());
+ EXPECT_THAT(escape("foo _bar"), escaped('_'));
+ EXPECT_THAT(escape("foo_ bar"), escaped('_'));
+ EXPECT_THAT(escape("foo _ bar"), escapedNone());
+
+ // HTML entities.
+ EXPECT_THAT(escape("fish &chips;"), escaped('&'));
+ EXPECT_THAT(escape("fish & chips;"), escapedNone());
+ EXPECT_THAT(escape("fish &chips"), escapedNone());
+ EXPECT_THAT(escape("foo * bar"), escaped('&'));
+ EXPECT_THAT(escape("foo ¯ bar"), escaped('&'));
+ EXPECT_THAT(escape("foo &?; bar"), escapedNone());
+
+ // Links.
+ EXPECT_THAT(escape("[foo](bar)"), escaped(']'));
+ EXPECT_THAT(escape("[foo]: bar"), escaped(']'));
+ // No need to escape these, as the target never exists.
+ EXPECT_THAT(escape("[foo][]"), escapedNone());
+ EXPECT_THAT(escape("[foo][bar]"), escapedNone());
+ EXPECT_THAT(escape("[foo]"), escapedNone());
// In code blocks we don't need to escape ASCII punctuation.
- P = Paragraph();
+ Paragraph P = Paragraph();
P.appendCode("* foo !+ bar * baz");
EXPECT_EQ(P.asMarkdown(), "`* foo !+ bar * baz`");
diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
index 51f8eac1bfb4..c243346a73f6 100644
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -1905,7 +1905,7 @@ TEST(Hover, PresentRulers) {
llvm::StringRef ExpectedMarkdown = R"md(### variable `foo`
---
-Value \= `val`
+Value = `val`
---
```cpp
More information about the cfe-commits
mailing list