[clang-tools-extra] 704cd4d - [clangd] Only minimally escape text when rendering to markdown.

Tue Mar 17 09:10:37 PDT 2020

Author: Sam McCall
Date: 2020-03-17T17:10:20+01:00
New Revision: 704cd4d5d0754904361823588f203369c309deca

URL: https://github.com/llvm/llvm-project/commit/704cd4d5d0754904361823588f203369c309deca
DIFF: https://github.com/llvm/llvm-project/commit/704cd4d5d0754904361823588f203369c309deca.diff

LOG: [clangd] Only minimally escape text when rendering to markdown.

Summary:
Conservatively escaping everything is bad in coc.nvim which shows the markdown
to the user, and we have reports of it causing problems for other parsers.

Fixes https://github.com/clangd/clangd/issues/301

Reviewers: kadircet

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D75687

Added: 
    

Modified: 
    clang-tools-extra/clangd/FormattedString.cpp
    clang-tools-extra/clangd/FormattedString.h
    clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
    clang-tools-extra/clangd/unittests/HoverTests.cpp

Removed: 
    


################################################################################
diff  --git a/clang-tools-extra/clangd/FormattedString.cpp b/clang-tools-extra/clangd/FormattedString.cpp
index 9cb732eabcf8..ecbc060a870a 100644

--- a/clang-tools-extra/clangd/FormattedString.cpp
+++ b/clang-tools-extra/clangd/FormattedString.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
@@ -26,23 +27,143 @@ namespace clangd {
 namespace markup {
 
 namespace {
+
+// Is <contents a plausible start to an HTML tag?
+// Contents may not be the rest of the line, but it's the rest of the plain
+// text, so we expect to see at least the tag name.
+bool looksLikeTag(llvm::StringRef Contents) {
+  if (Contents.empty())
+    return false;
+  if (Contents.front() == '!' || Contents.front() == '?' ||
+      Contents.front() == '/')
+    return true;
+  // Check the start of the tag name.
+  if (!llvm::isAlpha(Contents.front()))
+    return false;
+  // Drop rest of the tag name, and following whitespace.
+  Contents = Contents
+                 .drop_while([](char C) {
+                   return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':';
+                 })
+                 .drop_while(isWhitespace);
+  // The rest of the tag consists of attributes, which have restrictive names.
+  // If we hit '=', all bets are off (attribute values can contain anything).
+  for (; !Contents.empty(); Contents = Contents.drop_front()) {
+    if (llvm::isAlnum(Contents.front()) || isWhitespace(Contents.front()))
+      continue;
+    if (Contents.front() == '>' || Contents.startswith("/>"))
+      return true; // May close the tag.
+    if (Contents.front() == '=')
+      return true; // Don't try to parse attribute values.
+    return false;  // Random punctuation means this isn't a tag.
+  }
+  return true; // Potentially incomplete tag.
+}
+
+// Tests whether C should be backslash-escaped in markdown.
+// The string being escaped is Before + C + After. This is part of a paragraph.
+// StartsLine indicates whether `Before` is the start of the line.
+// After may not be everything until the end of the line.
+//
+// It's always safe to escape punctuation, but want minimal escaping.
+// The strategy is to escape the first character of anything that might start
+// a markdown grammar construct.
+bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
+                        bool StartsLine) {
+  assert(Before.take_while(isWhitespace).empty());
+  auto RulerLength = [&]() -> /*Length*/ unsigned {
+    if (!StartsLine || !Before.empty())
+      return false;
+    llvm::StringRef A = After.rtrim();
+    return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0;
+  };
+  auto IsBullet = [&]() {
+    return StartsLine && Before.empty() &&
+           (After.empty() || After.startswith(" "));
+  };
+  auto SpaceSurrounds = [&]() {
+    return (After.empty() || isWhitespace(After.front())) &&
+           (Before.empty() || isWhitespace(Before.back()));
+  };
+  auto WordSurrounds = [&]() {
+    return (!After.empty() && llvm::isAlnum(After.front())) &&
+           (!Before.empty() && llvm::isAlnum(Before.back()));
+  };
+
+  switch (C) {
+  case '\\': // Escaped character.
+    return true;
+  case '`': // Code block or inline code
+    // Any number of backticks can delimit an inline code block that can end
+    // anywhere (including on another line). We must escape them all.
+    return true;
+  case '~': // Code block
+    return StartsLine && Before.empty() && After.startswith("~~");
+  case '#': { // ATX heading.
+    if (!StartsLine || !Before.empty())
+      return false;
+    llvm::StringRef Rest = After.ltrim(C);
+    return Rest.empty() || Rest.startswith(" ");
+  }
+  case ']': // Link or link reference.
+    // We escape ] rather than [ here, because it's more constrained:
+    //   ](...) is an in-line link
+    //   ]: is a link reference
+    // The following are only links if the link reference exists:
+    //   ] by itself is a shortcut link
+    //   ][...] is an out-of-line link
+    // Because we never emit link references, we don't need to handle these.
+    return After.startswith(":") || After.startswith("(");
+  case '=': // Setex heading.
+    return RulerLength() > 0;
+  case '_': // Horizontal ruler or matched delimiter.
+    if (RulerLength() >= 3)
+      return true;
+    // Not a delimiter if surrounded by space, or inside a word.
+    // (The rules at word boundaries are subtle).
+    return !(SpaceSurrounds() || WordSurrounds());
+  case '-': // Setex heading, horizontal ruler, or bullet.
+    if (RulerLength() > 0)
+      return true;
+    return IsBullet();
+  case '+': // Bullet list.
+    return IsBullet();
+  case '*': // Bullet list, horizontal ruler, or delimiter.
+    return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
+  case '<': // HTML tag (or autolink, which we choose not to escape)
+    return looksLikeTag(After);
+  case '>': // Quote marker. Needs escaping at start of line.
+    return StartsLine && Before.empty();
+  case '&': { // HTML entity reference
+    auto End = After.find(';');
+    if (End == llvm::StringRef::npos)
+      return false;
+    llvm::StringRef Content = After.substr(0, End);
+    if (Content.consume_front("#")) {
+      if (Content.consume_front("x") || Content.consume_front("X"))
+        return llvm::all_of(Content, llvm::isHexDigit);
+      return llvm::all_of(Content, llvm::isDigit);
+    }
+    return llvm::all_of(Content, llvm::isAlpha);
+  }
+  case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
+  case ')':
+    return StartsLine && !Before.empty() &&
+           llvm::all_of(Before, llvm::isDigit) && After.startswith(" ");
+  default:
+    return false;
+  }
+}
+
 /// Escape a markdown text block. Ensures the punctuation will not introduce
 /// any of the markdown constructs.
-std::string renderText(llvm::StringRef Input) {
-  // Escaping ASCII punctuation ensures we can't start a markdown construct.
-  constexpr llvm::StringLiteral Punctuation =
-      R"txt(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)txt";
-
+std::string renderText(llvm::StringRef Input, bool StartsLine) {
   std::string R;
-  for (size_t From = 0; From < Input.size();) {
-    size_t Next = Input.find_first_of(Punctuation, From);
-    R += Input.substr(From, Next - From);
-    if (Next == llvm::StringRef::npos)
-      break;
-    R += "\\";
-    R += Input[Next];
-
-    From = Next + 1;
+  for (unsigned I = 0; I < Input.size(); ++I) {
+    if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1),
+                           StartsLine))
+      R.push_back('\\');
+    R.push_back(Input[I]);
   }
   return R;
 }
@@ -236,7 +357,7 @@ void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
     OS << Sep;
     switch (C.Kind) {
     case Chunk::PlainText:
-      OS << renderText(C.Contents);
+      OS << renderText(C.Contents, Sep.empty());
       break;
     case Chunk::InlineCode:
       OS << renderInlineBlock(C.Contents);

diff  --git a/clang-tools-extra/clangd/FormattedString.h b/clang-tools-extra/clangd/FormattedString.h
index effd03725925..fe864ba57568 100644
--- a/clang-tools-extra/clangd/FormattedString.h
+++ b/clang-tools-extra/clangd/FormattedString.h
@@ -95,6 +95,8 @@ class Document {
   BulletList &addBulletList();
 
   /// Doesn't contain any trailing newlines.
+  /// We try to make the markdown human-readable, e.g. avoid extra escaping.
+  /// At least one client (coc.nvim) displays the markdown verbatim!
   std::string asMarkdown() const;
   /// Doesn't contain any trailing newlines.
   std::string asPlainText() const;

diff  --git a/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp b/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
index 6489aa05f844..d3fa1aa8c0a7 100644
--- a/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FormattedStringTests.cpp
@@ -17,25 +17,96 @@ namespace clangd {
 namespace markup {
 namespace {
 
-TEST(Render, Escaping) {
-  // Check some ASCII punctuation
-  Paragraph P;
-  P.appendText("*!`");
-  EXPECT_EQ(P.asMarkdown(), "\\*\\!\\`");
+std::string escape(llvm::StringRef Text) {
+  return Paragraph().appendText(Text.str()).asMarkdown();
+}
+
+MATCHER_P(escaped, C, "") {
+  return testing::ExplainMatchResult(::testing::HasSubstr(std::string{'\\', C}),
+                                     arg, result_listener);
+}
 
+MATCHER(escapedNone, "") {
+  return testing::ExplainMatchResult(::testing::Not(::testing::HasSubstr("\\")),
+                                     arg, result_listener);
+}
+
+TEST(Render, Escaping) {
   // Check all ASCII punctuation.
-  P = Paragraph();
   std::string Punctuation = R"txt(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)txt";
-  // Same text, with each character escaped.
-  std::string EscapedPunctuation;
-  EscapedPunctuation.reserve(2 * Punctuation.size());
-  for (char C : Punctuation)
-    EscapedPunctuation += std::string("\\") + C;
-  P.appendText(Punctuation);
-  EXPECT_EQ(P.asMarkdown(), EscapedPunctuation);
+  std::string EscapedPunc = R"txt(!"#$%&'()\*+,-./:;<=>?@[\\]^\_\`{|}~)txt";
+  EXPECT_EQ(escape(Punctuation), EscapedPunc);
+
+  // Inline code
+  EXPECT_EQ(escape("`foo`"), R"(\`foo\`)");
+  EXPECT_EQ(escape("`foo"), R"(\`foo)");
+  EXPECT_EQ(escape("foo`"), R"(foo\`)");
+  EXPECT_EQ(escape("``foo``"), R"(\`\`foo\`\`)");
+  // Code blocks
+  EXPECT_EQ(escape("```"), R"(\`\`\`)"); // This could also be inline code!
+  EXPECT_EQ(escape("~~~"), R"(\~~~)");
+
+  // Rulers and headings
+  EXPECT_THAT(escape("## Heading"), escaped('#'));
+  EXPECT_THAT(escape("Foo # bar"), escapedNone());
+  EXPECT_EQ(escape("---"), R"(\---)");
+  EXPECT_EQ(escape("-"), R"(\-)");
+  EXPECT_EQ(escape("==="), R"(\===)");
+  EXPECT_EQ(escape("="), R"(\=)");
+  EXPECT_EQ(escape("***"), R"(\*\*\*)"); // \** could start emphasis!
+
+  // HTML tags.
+  EXPECT_THAT(escape("<pre"), escaped('<'));
+  EXPECT_THAT(escape("< pre"), escapedNone());
+  EXPECT_THAT(escape("if a<b then"), escaped('<'));
+  EXPECT_THAT(escape("if a<b then c."), escapedNone());
+  EXPECT_THAT(escape("if a<b then c='foo'."), escaped('<'));
+  EXPECT_THAT(escape("std::vector<T>"), escaped('<'));
+  EXPECT_THAT(escape("std::vector<std::string>"), escaped('<'));
+  EXPECT_THAT(escape("std::map<int, int>"), escapedNone());
+  // Autolinks
+  EXPECT_THAT(escape("Email <foo at bar.com>"), escapedNone());
+  EXPECT_THAT(escape("Website <http://foo.bar>"), escapedNone());
+
+  // Bullet lists.
+  EXPECT_THAT(escape("- foo"), escaped('-'));
+  EXPECT_THAT(escape("* foo"), escaped('*'));
+  EXPECT_THAT(escape("+ foo"), escaped('+'));
+  EXPECT_THAT(escape("+"), escaped('+'));
+  EXPECT_THAT(escape("a + foo"), escapedNone());
+  EXPECT_THAT(escape("a+ foo"), escapedNone());
+  EXPECT_THAT(escape("1. foo"), escaped('.'));
+  EXPECT_THAT(escape("a. foo"), escapedNone());
+
+  // Emphasis.
+  EXPECT_EQ(escape("*foo*"), R"(\*foo\*)");
+  EXPECT_EQ(escape("**foo**"), R"(\*\*foo\*\*)");
+  EXPECT_THAT(escape("*foo"), escaped('*'));
+  EXPECT_THAT(escape("foo *"), escapedNone());
+  EXPECT_THAT(escape("foo * bar"), escapedNone());
+  EXPECT_THAT(escape("foo_bar"), escapedNone());
+  EXPECT_THAT(escape("foo _bar"), escaped('_'));
+  EXPECT_THAT(escape("foo_ bar"), escaped('_'));
+  EXPECT_THAT(escape("foo _ bar"), escapedNone());
+
+  // HTML entities.
+  EXPECT_THAT(escape("fish &chips;"), escaped('&'));
+  EXPECT_THAT(escape("fish & chips;"), escapedNone());
+  EXPECT_THAT(escape("fish &chips"), escapedNone());
+  EXPECT_THAT(escape("foo * bar"), escaped('&'));
+  EXPECT_THAT(escape("foo &#xaf; bar"), escaped('&'));
+  EXPECT_THAT(escape("foo &?; bar"), escapedNone());
+
+  // Links.
+  EXPECT_THAT(escape("[foo](bar)"), escaped(']'));
+  EXPECT_THAT(escape("[foo]: bar"), escaped(']'));
+  // No need to escape these, as the target never exists.
+  EXPECT_THAT(escape("[foo][]"), escapedNone());
+  EXPECT_THAT(escape("[foo][bar]"), escapedNone());
+  EXPECT_THAT(escape("[foo]"), escapedNone());
 
   // In code blocks we don't need to escape ASCII punctuation.
-  P = Paragraph();
+  Paragraph P = Paragraph();
   P.appendCode("* foo !+ bar * baz");
   EXPECT_EQ(P.asMarkdown(), "`* foo !+ bar * baz`");
 

diff  --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
index 51f8eac1bfb4..c243346a73f6 100644
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -1905,7 +1905,7 @@ TEST(Hover, PresentRulers) {
   llvm::StringRef ExpectedMarkdown = R"md(### variable `foo`  
 
 ---
-Value \= `val`  
+Value = `val`  
 
 ---
 ```cpp