[clang] 7d669b7 - [Clang][Comments] Allow HTML tags across multiple lines (#120843)

Wed Feb 5 05:46:18 PST 2025

Author: nerix
Date: 2025-02-05T08:46:14-05:00
New Revision: 7d669b7c25e15034a85cc4888465084e7f05f251

URL: https://github.com/llvm/llvm-project/commit/7d669b7c25e15034a85cc4888465084e7f05f251
DIFF: https://github.com/llvm/llvm-project/commit/7d669b7c25e15034a85cc4888465084e7f05f251.diff

LOG: [Clang][Comments] Allow HTML tags across multiple lines (#120843)

HTML starting tags that span multiple lines were previously not allowed
(or rather, only the starting line was lexed as HTML). Doxygen allows
those tags.

This PR allows the starting tags to span multiple lines. They can't span
multiple (C-)Comments, though (it's likely a user-error). Multiple BCPL
comments are fine as those are single lines (shown below).

Example:

```c
/// <a
///     href="foo"
/// >Aaa</a>b
int Test;
```

Fixes #28321.

Added: 
    

Modified: 
    clang/docs/ReleaseNotes.rst
    clang/lib/AST/CommentLexer.cpp
    clang/test/AST/ast-dump-comment.cpp
    clang/unittests/AST/CommentLexer.cpp
    clang/unittests/AST/CommentParser.cpp

Removed: 
    


################################################################################
diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 18ecf1efa13a164..87d2544a30ed818 100644

--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -149,6 +149,8 @@ Bug Fixes to AST Handling
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
 
+- HTML tags in comments that span multiple lines are now parsed correctly by Clang's comment parser. (#GH120843)
+
 Miscellaneous Clang Crashes Fixed
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff  --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp
index ec9a5b480aa2954..804be89a8d4ddc6 100644
--- a/clang/lib/AST/CommentLexer.cpp
+++ b/clang/lib/AST/CommentLexer.cpp
@@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
   return BufferEnd;
 }
 
+const char *skipHorizontalWhitespace(const char *BufferPtr,
+                                     const char *BufferEnd) {
+  for (; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHorizontalWhitespace(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 }
@@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) {
   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
   T.setHTMLTagStartName(Name);
 
-  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+  if (BufferPtr == CommentEnd) { // in BCPL comments
+    State = LS_HTMLStartTag;
+    return;
+  }
 
   const char C = *BufferPtr;
   if (BufferPtr != CommentEnd &&
-      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
+      (C == '>' || C == '/' || isVerticalWhitespace(C) ||
+       isHTMLIdentifierStartingCharacter(C)))
     State = LS_HTMLStartTag;
 }
 
 void Lexer::lexHTMLStartTag(Token &T) {
   assert(State == LS_HTMLStartTag);
 
+  // Skip leading whitespace and comment decorations
+  while (isVerticalWhitespace(*BufferPtr)) {
+    BufferPtr = skipNewline(BufferPtr, CommentEnd);
+
+    if (CommentState == LCS_InsideCComment)
+      skipLineStartingDecorations();
+
+    BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+    if (BufferPtr == CommentEnd) {
+      // HTML starting tags must be defined in a single comment block.
+      // It's likely a user-error where they forgot to terminate the comment.
+      State = LS_Normal;
+      // Since at least one newline was skipped and one token needs to be lexed,
+      // return a newline.
+      formTokenWithChars(T, BufferPtr, tok::newline);
+      return;
+    }
+  }
+
   const char *TokenPtr = BufferPtr;
   char C = *TokenPtr;
   if (isHTMLIdentifierCharacter(C)) {
@@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) {
 
   // Now look ahead and return to normal state if we don't see any HTML tokens
   // ahead.
-  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
   if (BufferPtr == CommentEnd) {
-    State = LS_Normal;
     return;
   }
 
   C = *BufferPtr;
-  if (!isHTMLIdentifierStartingCharacter(C) &&
+  if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
     State = LS_Normal;
     return;
@@ -774,8 +806,17 @@ void Lexer::lex(Token &T) {
         BufferPtr++;
 
       CommentState = LCS_InsideBCPLComment;
-      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
+      switch (State) {
+      case LS_VerbatimBlockFirstLine:
+      case LS_VerbatimBlockBody:
+        break;
+      case LS_HTMLStartTag:
+        BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
+        break;
+      default:
         State = LS_Normal;
+        break;
+      }
       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
       goto again;
     }
@@ -807,6 +848,14 @@ void Lexer::lex(Token &T) {
     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
       EndWhitespace++;
 
+    // When lexing the start of an HTML tag (i.e. going through the attributes)
+    // there won't be any newlines generated.
+    if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
+      CommentState = LCS_BeforeComment;
+      BufferPtr = EndWhitespace;
+      goto again;
+    }
+
     // Turn any whitespace between comments (and there is only whitespace
     // between them -- guaranteed by comment extraction) into a newline.  We
     // have two newlines between C comments in total (first one was synthesized
@@ -829,6 +878,14 @@ void Lexer::lex(Token &T) {
         BufferPtr += 2;
         assert(BufferPtr <= BufferEnd);
 
+        // When lexing the start of an HTML tag (i.e. going through the
+        // attributes) there won't be any newlines generated - whitespace still
+        // needs to be skipped.
+        if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
+          CommentState = LCS_BetweenComments;
+          goto again;
+        }
+
         // Synthenize newline just after the C comment, regardless if there is
         // actually a newline.
         formTokenWithChars(T, BufferPtr, tok::newline);

diff  --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index 9798295b420f9a2..40c3edb62821bb7 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -91,6 +91,19 @@ int Test_HTMLTagComment;
 // CHECK-NEXT:       TextComment{{.*}} Text=" "
 // CHECK-NEXT:       HTMLStartTagComment{{.*}} Name="br" SelfClosing
 
+/// <a
+///     href="foo"
+/// >Aaa</a>b
+int Test_HTMLTagMultilineBCPL;
+// CHECK:      VarDecl{{.*}}Test_HTMLTagMultilineBCPL
+// CHECK-NEXT:   FullComment
+// CHECK-NEXT:     ParagraphComment
+// CHECK-NEXT:       TextComment{{.*}} Text=" "
+// CHECK-NEXT:       HTMLStartTagComment{{.*}} Name="a" Attrs:  "href="foo"
+// CHECK-NEXT:       TextComment{{.*}} Text="Aaa"
+// CHECK-NEXT:       HTMLEndTagComment{{.*}} Name="a"
+// CHECK-NEXT:       TextComment{{.*}} Text="b"
+
 /// \verbatim
 /// Aaa
 /// \endverbatim

diff  --git a/clang/unittests/AST/CommentLexer.cpp b/clang/unittests/AST/CommentLexer.cpp
index 1e7bad89898f4c2..22866f0eb23edcf 100644
--- a/clang/unittests/AST/CommentLexer.cpp
+++ b/clang/unittests/AST/CommentLexer.cpp
@@ -1453,6 +1453,160 @@ TEST_F(CommentLexerTest, HTML19) {
   ASSERT_EQ(tok::newline,      Toks[2].getKind());
 }
 
+TEST_F(CommentLexerTest, HTML20) {
+  const char *Source = "// <a\n"
+                       "// \n"
+                       "// href=\"foo\"\n"
+                       "// \n"
+                       "// bar>text</a>";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(11U, Toks.size());
+
+  ASSERT_EQ(tok::text, Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[1].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::html_ident, Toks[2].getKind());
+  ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_equals, Toks[3].getKind());
+
+  ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind());
+  ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString());
+
+  ASSERT_EQ(tok::html_ident, Toks[5].getKind());
+  ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_greater, Toks[6].getKind());
+
+  ASSERT_EQ(tok::text, Toks[7].getKind());
+  ASSERT_EQ(StringRef("text"), Toks[7].getText());
+
+  ASSERT_EQ(tok::html_end_tag, Toks[8].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName());
+
+  ASSERT_EQ(tok::html_greater, Toks[9].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[10].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML21) {
+  const char *Source = "/**\n"
+                       " * <a\n"
+                       " * \n"
+                       " * href=\"foo\"\n"
+                       " * \n"
+                       " * bar>text</a>\n"
+                       " */";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(15U, Toks.size());
+
+  ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::html_ident, Toks[3].getKind());
+  ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_equals, Toks[4].getKind());
+
+  ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind());
+  ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString());
+
+  ASSERT_EQ(tok::html_ident, Toks[6].getKind());
+  ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_greater, Toks[7].getKind());
+
+  ASSERT_EQ(tok::text, Toks[8].getKind());
+  ASSERT_EQ(StringRef("text"), Toks[8].getText());
+
+  ASSERT_EQ(tok::html_end_tag, Toks[9].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName());
+
+  ASSERT_EQ(tok::html_greater, Toks[10].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[11].getKind());
+
+  ASSERT_EQ(tok::text, Toks[12].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[12].getText());
+
+  ASSERT_EQ(tok::newline, Toks[13].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[14].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML22) {
+  const char *Source = "/**\n"
+                       " * <a\n"
+                       " */";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(6U, Toks.size());
+
+  ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::newline, Toks[3].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[4].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[5].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML23) {
+  // NOTE: "//<" is considered a comment start
+  const char *Source = "// <\n"
+                       "// a\n"
+                       "// >";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(7U, Toks.size());
+
+  ASSERT_EQ(tok::text, Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef("<"), Toks[1].getText());
+
+  ASSERT_EQ(tok::newline, Toks[2].getKind());
+
+  ASSERT_EQ(tok::text, Toks[3].getKind());
+  ASSERT_EQ(StringRef(" a"), Toks[3].getText());
+
+  ASSERT_EQ(tok::newline, Toks[4].getKind());
+
+  ASSERT_EQ(tok::text, Toks[5].getKind());
+  ASSERT_EQ(StringRef(" >"), Toks[5].getText());
+
+  ASSERT_EQ(tok::newline, Toks[6].getKind());
+}
+
 TEST_F(CommentLexerTest, NotAKnownHTMLTag1) {
   const char *Source = "// <tag>";
 

diff  --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp
index e0df182d430c36a..aa08b6718e506f5 100644
--- a/clang/unittests/AST/CommentParser.cpp
+++ b/clang/unittests/AST/CommentParser.cpp
@@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) {
 
 TEST_F(CommentParserTest, HTML1) {
   const char *Sources[] = {
-    "// <a",
-    "// <a>",
-    "// <a >"
+      "// <a",
+      "// <a>",
+      "// <a >",
+      "// <a\n// >",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) {
 
 TEST_F(CommentParserTest, HTML2) {
   const char *Sources[] = {
-    "// <br/>",
-    "// <br />"
+      "// <br/>",
+      "// <br />",
+      "// <br \n// />",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) {
 
 TEST_F(CommentParserTest, HTML3) {
   const char *Sources[] = {
-    "// <a href",
-    "// <a href ",
-    "// <a href>",
-    "// <a href >",
+      "// <a href",   "// <a href ",       "// <a href>",
+      "// <a href >", "// <a \n// href >",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) {
 
 TEST_F(CommentParserTest, HTML4) {
   const char *Sources[] = {
-    "// <a href=\"bbb\"",
-    "// <a href=\"bbb\">",
+      "// <a href=\"bbb\"",
+      "// <a href=\"bbb\">",
+      "// <a \n// href=\"bbb\">",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {