[clang] [Clang][Comments] Allow HTML tags across multiple lines (PR #120843)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jan 15 09:29:58 PST 2025
https://github.com/Nerixyz updated https://github.com/llvm/llvm-project/pull/120843
>From 577d6d06224410a1da80cad2377041b8cd6db3ca Mon Sep 17 00:00:00 2001
From: Nerixyz <nerixdev at outlook.de>
Date: Sat, 21 Dec 2024 16:23:32 +0100
Subject: [PATCH] [Clang][Comments] Allow HTML tags across multiple lines
---
clang/docs/ReleaseNotes.rst | 2 +
clang/lib/AST/CommentLexer.cpp | 69 +++++++++++-
clang/test/AST/ast-dump-comment.cpp | 13 +++
clang/unittests/AST/CommentLexer.cpp | 154 ++++++++++++++++++++++++++
clang/unittests/AST/CommentParser.cpp | 23 ++--
5 files changed, 244 insertions(+), 17 deletions(-)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index edb2e4a10ded05..e0af9c15fdd07a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -873,6 +873,8 @@ Bug Fixes to AST Handling
Miscellaneous Bug Fixes
^^^^^^^^^^^^^^^^^^^^^^^
+- HTML tags in comments that span multiple lines are now parsed correctly by Clang's comment parser. (#GH120843)
+
Miscellaneous Clang Crashes Fixed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp
index ec9a5b480aa295..804be89a8d4ddc 100644
--- a/clang/lib/AST/CommentLexer.cpp
+++ b/clang/lib/AST/CommentLexer.cpp
@@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
return BufferEnd;
}
+const char *skipHorizontalWhitespace(const char *BufferPtr,
+ const char *BufferEnd) {
+ for (; BufferPtr != BufferEnd; ++BufferPtr) {
+ if (!isHorizontalWhitespace(*BufferPtr))
+ return BufferPtr;
+ }
+ return BufferEnd;
+}
+
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}
@@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) {
formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
T.setHTMLTagStartName(Name);
- BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+ BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+ if (BufferPtr == CommentEnd) { // e.g. end of a BCPL comment line
+ State = LS_HTMLStartTag;
+ return;
+ }
const char C = *BufferPtr;
if (BufferPtr != CommentEnd &&
- (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
+ (C == '>' || C == '/' || isVerticalWhitespace(C) ||
+ isHTMLIdentifierStartingCharacter(C)))
State = LS_HTMLStartTag;
}
void Lexer::lexHTMLStartTag(Token &T) {
assert(State == LS_HTMLStartTag);
+ // Skip leading whitespace and comment decorations
+ while (isVerticalWhitespace(*BufferPtr)) {
+ BufferPtr = skipNewline(BufferPtr, CommentEnd);
+
+ if (CommentState == LCS_InsideCComment)
+ skipLineStartingDecorations();
+
+ BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+ if (BufferPtr == CommentEnd) {
+ // HTML start tags must be defined in a single comment block.
+ // It's likely a user error where they forgot to terminate the comment.
+ State = LS_Normal;
+ // Since at least one newline was skipped and one token needs to be lexed,
+ // return a newline.
+ formTokenWithChars(T, BufferPtr, tok::newline);
+ return;
+ }
+ }
+
const char *TokenPtr = BufferPtr;
char C = *TokenPtr;
if (isHTMLIdentifierCharacter(C)) {
@@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) {
// Now look ahead and return to normal state if we don't see any HTML tokens
// ahead.
- BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+ BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
if (BufferPtr == CommentEnd) {
- State = LS_Normal;
return;
}
C = *BufferPtr;
- if (!isHTMLIdentifierStartingCharacter(C) &&
+ if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
State = LS_Normal;
return;
@@ -774,8 +806,17 @@ void Lexer::lex(Token &T) {
BufferPtr++;
CommentState = LCS_InsideBCPLComment;
- if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
+ switch (State) {
+ case LS_VerbatimBlockFirstLine:
+ case LS_VerbatimBlockBody:
+ break;
+ case LS_HTMLStartTag:
+ BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
+ break;
+ default:
State = LS_Normal;
+ break;
+ }
CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
goto again;
}
@@ -807,6 +848,14 @@ void Lexer::lex(Token &T) {
while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
EndWhitespace++;
+ // When lexing the start of an HTML tag (i.e. going through the attributes)
+ // there won't be any newlines generated.
+ if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
+ CommentState = LCS_BeforeComment;
+ BufferPtr = EndWhitespace;
+ goto again;
+ }
+
// Turn any whitespace between comments (and there is only whitespace
// between them -- guaranteed by comment extraction) into a newline. We
// have two newlines between C comments in total (first one was synthesized
@@ -829,6 +878,14 @@ void Lexer::lex(Token &T) {
BufferPtr += 2;
assert(BufferPtr <= BufferEnd);
+ // When lexing the start of an HTML tag (i.e. going through the
+ // attributes) there won't be any newlines generated - whitespace still
+ // needs to be skipped.
+ if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
+ CommentState = LCS_BetweenComments;
+ goto again;
+ }
+
// Synthenize newline just after the C comment, regardless if there is
// actually a newline.
formTokenWithChars(T, BufferPtr, tok::newline);
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
index 9798295b420f9a..40c3edb62821bb 100644
--- a/clang/test/AST/ast-dump-comment.cpp
+++ b/clang/test/AST/ast-dump-comment.cpp
@@ -91,6 +91,19 @@ int Test_HTMLTagComment;
// CHECK-NEXT: TextComment{{.*}} Text=" "
// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="br" SelfClosing
+/// <a
+/// href="foo"
+/// >Aaa</a>b
+int Test_HTMLTagMultilineBCPL;
+// CHECK: VarDecl{{.*}}Test_HTMLTagMultilineBCPL
+// CHECK-NEXT: FullComment
+// CHECK-NEXT: ParagraphComment
+// CHECK-NEXT: TextComment{{.*}} Text=" "
+// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="a" Attrs: "href="foo"
+// CHECK-NEXT: TextComment{{.*}} Text="Aaa"
+// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="a"
+// CHECK-NEXT: TextComment{{.*}} Text="b"
+
/// \verbatim
/// Aaa
/// \endverbatim
diff --git a/clang/unittests/AST/CommentLexer.cpp b/clang/unittests/AST/CommentLexer.cpp
index 1e7bad89898f4c..22866f0eb23edc 100644
--- a/clang/unittests/AST/CommentLexer.cpp
+++ b/clang/unittests/AST/CommentLexer.cpp
@@ -1453,6 +1453,160 @@ TEST_F(CommentLexerTest, HTML19) {
ASSERT_EQ(tok::newline, Toks[2].getKind());
}
+TEST_F(CommentLexerTest, HTML20) {
+ const char *Source = "// <a\n"
+ "// \n"
+ "// href=\"foo\"\n"
+ "// \n"
+ "// bar>text</a>";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(11U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::html_start_tag, Toks[1].getKind());
+ ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName());
+
+ ASSERT_EQ(tok::html_ident, Toks[2].getKind());
+ ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent());
+
+ ASSERT_EQ(tok::html_equals, Toks[3].getKind());
+
+ ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind());
+ ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString());
+
+ ASSERT_EQ(tok::html_ident, Toks[5].getKind());
+ ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent());
+
+ ASSERT_EQ(tok::html_greater, Toks[6].getKind());
+
+ ASSERT_EQ(tok::text, Toks[7].getKind());
+ ASSERT_EQ(StringRef("text"), Toks[7].getText());
+
+ ASSERT_EQ(tok::html_end_tag, Toks[8].getKind());
+ ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName());
+
+ ASSERT_EQ(tok::html_greater, Toks[9].getKind());
+
+ ASSERT_EQ(tok::newline, Toks[10].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML21) {
+ const char *Source = "/**\n"
+ " * <a\n"
+ " * \n"
+ " * href=\"foo\"\n"
+ " * \n"
+ " * bar>text</a>\n"
+ " */";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(15U, Toks.size());
+
+ ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+ ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+ ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+ ASSERT_EQ(tok::html_ident, Toks[3].getKind());
+ ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent());
+
+ ASSERT_EQ(tok::html_equals, Toks[4].getKind());
+
+ ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind());
+ ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString());
+
+ ASSERT_EQ(tok::html_ident, Toks[6].getKind());
+ ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent());
+
+ ASSERT_EQ(tok::html_greater, Toks[7].getKind());
+
+ ASSERT_EQ(tok::text, Toks[8].getKind());
+ ASSERT_EQ(StringRef("text"), Toks[8].getText());
+
+ ASSERT_EQ(tok::html_end_tag, Toks[9].getKind());
+ ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName());
+
+ ASSERT_EQ(tok::html_greater, Toks[10].getKind());
+
+ ASSERT_EQ(tok::newline, Toks[11].getKind());
+
+ ASSERT_EQ(tok::text, Toks[12].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[12].getText());
+
+ ASSERT_EQ(tok::newline, Toks[13].getKind());
+
+ ASSERT_EQ(tok::newline, Toks[14].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML22) {
+ const char *Source = "/**\n"
+ " * <a\n"
+ " */";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(6U, Toks.size());
+
+ ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+ ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+ ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+
+ ASSERT_EQ(tok::newline, Toks[4].getKind());
+
+ ASSERT_EQ(tok::newline, Toks[5].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML23) {
+ // NOTE: "//<" is considered a comment start
+ const char *Source = "// <\n"
+ "// a\n"
+ "// >";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(7U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("<"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+
+ ASSERT_EQ(tok::text, Toks[3].getKind());
+ ASSERT_EQ(StringRef(" a"), Toks[3].getText());
+
+ ASSERT_EQ(tok::newline, Toks[4].getKind());
+
+ ASSERT_EQ(tok::text, Toks[5].getKind());
+ ASSERT_EQ(StringRef(" >"), Toks[5].getText());
+
+ ASSERT_EQ(tok::newline, Toks[6].getKind());
+}
+
TEST_F(CommentLexerTest, NotAKnownHTMLTag1) {
const char *Source = "// <tag>";
diff --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp
index e0df182d430c36..aa08b6718e506f 100644
--- a/clang/unittests/AST/CommentParser.cpp
+++ b/clang/unittests/AST/CommentParser.cpp
@@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) {
TEST_F(CommentParserTest, HTML1) {
const char *Sources[] = {
- "// <a",
- "// <a>",
- "// <a >"
+ "// <a",
+ "// <a>",
+ "// <a >",
+ "// <a\n// >",
};
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) {
TEST_F(CommentParserTest, HTML2) {
const char *Sources[] = {
- "// <br/>",
- "// <br />"
+ "// <br/>",
+ "// <br />",
+ "// <br \n// />",
};
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) {
TEST_F(CommentParserTest, HTML3) {
const char *Sources[] = {
- "// <a href",
- "// <a href ",
- "// <a href>",
- "// <a href >",
+ "// <a href", "// <a href ", "// <a href>",
+ "// <a href >", "// <a \n// href >",
};
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) {
TEST_F(CommentParserTest, HTML4) {
const char *Sources[] = {
- "// <a href=\"bbb\"",
- "// <a href=\"bbb\">",
+ "// <a href=\"bbb\"",
+ "// <a href=\"bbb\">",
+ "// <a \n// href=\"bbb\">",
};
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
More information about the cfe-commits
mailing list