[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)
Neil Nair via cfe-commits
cfe-commits at lists.llvm.org
Sat Jun 13 13:33:34 PDT 2026
https://github.com/Neil-N4 updated https://github.com/llvm/llvm-project/pull/202991
>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 09:51:48 -0400
Subject: [PATCH 01/27] [clang-doc] Add standalone Markdown parsing library
---
.../clang-doc/support/CMakeLists.txt | 3 +-
.../clang-doc/support/Markdown.cpp | 145 ++++++++++++++++++
.../clang-doc/support/Markdown.h | 72 +++++++++
.../unittests/clang-doc/CMakeLists.txt | 4 +-
.../clang-doc/MarkdownParserTest.cpp | 94 ++++++++++++
5 files changed, 316 insertions(+), 2 deletions(-)
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h
create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt
index 8ac913ffbe998..acff865190ff9 100644
--- a/clang-tools-extra/clang-doc/support/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt
@@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS
add_clang_library(clangDocSupport STATIC
File.cpp
+ Markdown.cpp
Utils.cpp
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
new file mode 100644
index 0000000000000..776150b939d27
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLog.h"
+
+#define DEBUG_TYPE "clang-doc-markdown"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+static MDNode makeText(StringRef S) {
+ return {NodeKind::NK_Text, S, {}};
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces,
+// and has at least one -.
+static bool isSepRow(StringRef Line) {
+ return Line.contains('-') &&
+ Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Returns true if Line begins with a bullet list marker (-, *, or +)
+// followed by a space.
+static bool isListItem(StringRef Line) {
+ return Line.starts_with("- ") || Line.starts_with("* ") ||
+ Line.starts_with("+ ");
+}
+
+static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
+ BumpPtrAllocator &Arena) {
+ if (Nodes.empty())
+ return {};
+ MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
+ std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
+ return ArrayRef<MDNode>(Allocated, Nodes.size());
+}
+
+ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
+ BumpPtrAllocator &Arena) {
+ if (ParagraphText.trim().empty())
+ return {};
+
+ SmallVector<StringRef, 16> Lines;
+ ParagraphText.split(Lines, '\n');
+
+ SmallVector<MDNode> Nodes;
+ size_t I = 0, E = Lines.size();
+
+ while (I < E) {
+ StringRef Line = Lines[I].trim();
+
+ if (Line.empty()) {
+ ++I;
+ continue;
+ }
+
+ // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+ // indented up to 3 spaces, the closing fence must use the same character
+ // and be at least as long as the opening fence, and the closing fence may
+ // only be followed by spaces. Doxygen specifics should be handled on a
+ // case-by-case basis.
+ if (Line.starts_with("```") || Line.starts_with("~~~")) {
+ char Fence = Line[0];
+ StringRef Lang = Line.drop_front(3).trim();
+ SmallVector<MDNode> CodeLines;
+ ++I;
+ while (I < E) {
+ StringRef CodeLine = Lines[I].trim();
+ if (CodeLine.size() >= 3 &&
+ all_of(CodeLine.take_front(3),
+ [Fence](char C) { return C == Fence; }))
+ break;
+ CodeLines.push_back(makeText(Lines[I]));
+ ++I;
+ }
+ ++I; // skip closing fence
+ MDNode Code;
+ Code.Kind = NodeKind::NK_FencedCode;
+ Code.Content = Lang;
+ Code.Children = allocateNodes(CodeLines, Arena);
+ LDBG() << "emitting NK_FencedCode lang='" << Lang
+ << "' lines=" << CodeLines.size();
+ Nodes.push_back(Code);
+ continue;
+ }
+
+ // Pipe table: current line has | and next line is a separator row.
+ if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+ SmallVector<MDNode> Rows;
+ while (I < E && Lines[I].trim().contains('|')) {
+ Rows.push_back(makeText(Lines[I].trim()));
+ ++I;
+ }
+ MDNode Table;
+ Table.Kind = NodeKind::NK_Table;
+ Table.Content = {};
+ Table.Children = allocateNodes(Rows, Arena);
+ LDBG() << "emitting NK_Table rows=" << Rows.size();
+ Nodes.push_back(Table);
+ continue;
+ }
+
+ // Unordered list item.
+ if (isListItem(Line)) {
+ SmallVector<MDNode> Items;
+ while (I < E) {
+ StringRef L = Lines[I].trim();
+ if (!isListItem(L))
+ break;
+ MDNode Item;
+ Item.Kind = NodeKind::NK_ListItem;
+ Item.Content = L.drop_front(2).trim();
+ Item.Children = {};
+ Items.push_back(Item);
+ ++I;
+ }
+ MDNode List;
+ List.Kind = NodeKind::NK_UnorderedList;
+ List.Content = {};
+ List.Children = allocateNodes(Items, Arena);
+ LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+ Nodes.push_back(List);
+ continue;
+ }
+
+ // Plain text fallback.
+ Nodes.push_back(makeText(Line));
+ ++I;
+ }
+
+ LDBG() << "parseMarkdown done nodes=" << Nodes.size();
+ return allocateNodes(Nodes, Arena);
+}
+
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
new file mode 100644
index 0000000000000..890f764f937b1
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines a standalone Markdown parsing library for the LLVM
+/// ecosystem. The parser takes plain text and returns a tree of typed nodes
+/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+///
+/// This is a simple Markdown parser for use inside Clang-Doc's comment
+/// pipeline. You give it a paragraph of text and an arena allocator, and it
+/// gives back a list of typed nodes describing the Markdown structure it found.
+///
+/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
+/// you get back an empty list and can fall back to plain-text output. If it
+/// does, you get a tree of MDNode structs where each node has a kind, optional
+/// content (like the language tag on a code fence), and optional children.
+///
+/// All nodes are allocated in the arena you pass in. You own the arena and are
+/// responsible for keeping it alive as long as you use the nodes.
+///
+/// The parser handles fenced code blocks, pipe tables, and unordered lists.
+/// Anything it does not recognize comes back as a plain text node. It will
+/// never crash on bad input.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+
+namespace clang::doc::markdown {
+
+enum class NodeKind {
+ // Block nodes
+ NK_Paragraph,
+ NK_FencedCode,
+ NK_Table,
+ NK_UnorderedList,
+ NK_OrderedList,
+ NK_ListItem,
+ NK_ThematicBreak,
+ // Inline nodes
+ NK_Text,
+ NK_InlineCode,
+ NK_Emphasis,
+ NK_Strong,
+ NK_SoftBreak,
+};
+
+struct MDNode {
+ NodeKind Kind;
+ llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
+ llvm::ArrayRef<MDNode> Children; // arena allocated
+};
+
+/// Parses Markdown from a single comment paragraph's text.
+/// Returns an empty ArrayRef if no Markdown constructs are found,
+/// so generators can fall back to plain-text rendering at zero cost.
+llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
+ llvm::BumpPtrAllocator &Arena);
+
+} // namespace clang::doc::markdown
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 01b34ec9a791e..b74207ac88fa7 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests
ClangDocTest.cpp
GeneratorTest.cpp
HTMLGeneratorTest.cpp
+ MarkdownParserTest.cpp
MDGeneratorTest.cpp
MergeTest.cpp
SerializeTest.cpp
@@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests
target_link_libraries(ClangDocTests
PRIVATE
clangDoc
+ clangDocSupport
LLVMTestingSupport
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
new file mode 100644
index 0000000000000..8df5efc7f1d5f
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/Support/Allocator.h"
+#include "gtest/gtest.h"
+
+using namespace clang::doc::markdown;
+
+namespace {
+
+TEST(MarkdownParserTest, EmptyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, WhitespaceOnlyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown(" \n \n", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("hello world", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
+ EXPECT_EQ(Nodes[0].Content, "hello world");
+}
+
+TEST(MarkdownParserTest, FencedCodeBlock) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(Nodes[0].Content, "cpp");
+ ASSERT_EQ(Nodes[0].Children.size(), 1u);
+}
+
+TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(Nodes[0].Content.empty());
+}
+
+TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+ // Unterminated fence should not crash and should produce a code node
+ // with whatever lines were found.
+ EXPECT_FALSE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PipeTable) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("a | b\nc | d", Arena);
+ // No separator row so should not be parsed as a table
+ for (const auto &Node : Nodes)
+ EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, UnorderedList) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
+ ASSERT_EQ(Nodes[0].Children.size(), 3u);
+ EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
+ EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
+ EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+}
+
+TEST(MarkdownParserTest, MixedContent) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+ EXPECT_EQ(Nodes.size(), 3u);
+}
+
+} // namespace
\ No newline at end of file
>From f4cb4a28630e0f91289bfd4416c59114c5654ff7 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 11:35:54 -0400
Subject: [PATCH 02/27] [clang-doc] Address review feedback: test fixture, raw
strings, DEBUG_TYPE, EOF newlines
---
.../clang-doc/support/Markdown.cpp | 4 +-
.../clang-doc/support/Markdown.h | 2 +-
.../clang-doc/MarkdownParserTest.cpp | 97 +++++++++++--------
3 files changed, 61 insertions(+), 42 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 776150b939d27..9e008abf8b08d 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,7 +12,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
-#define DEBUG_TYPE "clang-doc-markdown"
+#define DEBUG_TYPE "clang-doc"
using namespace llvm;
@@ -142,4 +142,4 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
return allocateNodes(Nodes, Arena);
}
-} // namespace clang::doc::markdown
\ No newline at end of file
+} // namespace clang::doc::markdown
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 890f764f937b1..09b79cc8f2437 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -69,4 +69,4 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
} // namespace clang::doc::markdown
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 8df5efc7f1d5f..ff9bad88da136 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -14,80 +14,99 @@ using namespace clang::doc::markdown;
namespace {
-TEST(MarkdownParserTest, EmptyInput) {
+struct MarkdownParserTest : public ::testing::Test {
llvm::BumpPtrAllocator Arena;
+};
+
+TEST_F(MarkdownParserTest, EmptyInput) {
auto Nodes = parseMarkdown("", Arena);
EXPECT_TRUE(Nodes.empty());
}
-TEST(MarkdownParserTest, WhitespaceOnlyInput) {
- llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
auto Nodes = parseMarkdown(" \n \n", Arena);
EXPECT_TRUE(Nodes.empty());
}
-TEST(MarkdownParserTest, PlainText) {
- llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
- EXPECT_EQ(Nodes[0].Content, "hello world");
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_Text);
+ EXPECT_EQ(N.Content, "hello world");
}
-TEST(MarkdownParserTest, FencedCodeBlock) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlock) {
+ auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;
+````)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(Nodes[0].Content, "cpp");
- ASSERT_EQ(Nodes[0].Children.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "cpp");
+ ASSERT_EQ(N.Children.size(), 1u);
}
-TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
+ auto Nodes = parseMarkdown(R"(```
+some code
+```)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(Nodes[0].Content.empty());
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Content.empty());
}
-TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+ auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;)",
+ Arena);
// Unterminated fence should not crash and should produce a code node
// with whatever lines were found.
EXPECT_FALSE(Nodes.empty());
}
-TEST(MarkdownParserTest, PipeTable) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+TEST_F(MarkdownParserTest, PipeTable) {
+ auto Nodes = parseMarkdown(R"(| A | B |
+|---|---|
+| 1 | 2 |)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
}
-TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("a | b\nc | d", Arena);
- // No separator row so should not be parsed as a table
+TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+ auto Nodes = parseMarkdown(R"(a | b
+c | d)",
+ Arena);
+ // No separator row so should not be parsed as a table.
for (const auto &Node : Nodes)
EXPECT_NE(Node.Kind, NodeKind::NK_Table);
}
-TEST(MarkdownParserTest, UnorderedList) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+TEST_F(MarkdownParserTest, UnorderedList) {
+ auto Nodes = parseMarkdown(R"(- foo
+- bar
+- baz)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
- ASSERT_EQ(Nodes[0].Children.size(), 3u);
- EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
- EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
- EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
+ ASSERT_EQ(N.Children.size(), 3u);
+ EXPECT_EQ(N.Children[0].Content, "foo");
+ EXPECT_EQ(N.Children[1].Content, "bar");
+ EXPECT_EQ(N.Children[2].Content, "baz");
}
-TEST(MarkdownParserTest, MixedContent) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+TEST_F(MarkdownParserTest, MixedContent) {
+ auto Nodes = parseMarkdown(R"(some text
+```
+code
+````
+- item)",
+ Arena);
EXPECT_EQ(Nodes.size(), 3u);
}
>From 3ef8f62edab311caff0907ab2b9a0c3aaeb14353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:45:44 -0400
Subject: [PATCH 03/27] [clang-doc] Add CommonMark spec tests for fenced code
blocks
---
.../clang-doc/MarkdownParserTest.cpp | 112 +++++++++++++++++-
1 file changed, 108 insertions(+), 4 deletions(-)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ff9bad88da136..4ca979c1f1d24 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -39,7 +39,7 @@ TEST_F(MarkdownParserTest, PlainText) {
TEST_F(MarkdownParserTest, FencedCodeBlock) {
auto Nodes = parseMarkdown(R"(```cpp
int x = 0;
-````)",
+````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
const auto &N = Nodes[0];
@@ -51,7 +51,7 @@ int x = 0;
TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
auto Nodes = parseMarkdown(R"(```
some code
-```)",
+```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
const auto &N = Nodes[0];
@@ -102,12 +102,116 @@ TEST_F(MarkdownParserTest, UnorderedList) {
TEST_F(MarkdownParserTest, MixedContent) {
auto Nodes = parseMarkdown(R"(some text
-```
+```````
code
-````
+````````
- item)",
Arena);
EXPECT_EQ(Nodes.size(), 3u);
}
+// CommonMark §4.5 example 120: tilde fences work the same as backtick fences.
+TEST_F(MarkdownParserTest, TildeFence) {
+ auto Nodes = parseMarkdown(R"(~~~
+int x = 0;
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Content.empty());
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 120: tilde fence with a language tag.
+TEST_F(MarkdownParserTest, TildeFenceWithLang) {
+ auto Nodes = parseMarkdown(R"(~~~cpp
+int x = 0;
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "cpp");
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
+TEST_F(MarkdownParserTest, ClosingFenceMustMatchOpeningChar) {
+ auto Nodes = parseMarkdown(R"(```
+aaa
+~~~
+````````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ // ~~~ is content, not a closing fence.
+ ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 130: a code block can be empty.
+TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
+ auto Nodes = parseMarkdown(R"(```
+```````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Children.empty());
+}
+
+// CommonMark §4.5 example 129: a code block may contain only blank lines.
+TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
+ auto Nodes = parseMarkdown("```\n\n \n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 142: lang tag is captured from the info string.
+TEST_F(MarkdownParserTest, InfoStringLangTag) {
+ auto Nodes = parseMarkdown(R"(```ruby
+def foo(x)
+ return 3
+end
+``````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "ruby");
+ ASSERT_EQ(N.Children.size(), 3u);
+}
+
+// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
+TEST_F(MarkdownParserTest, TildeFenceInfoStringWithBackticks) {
+ auto Nodes = parseMarkdown(R"(~~~ aa ``` ~~~
+foo
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "aa ``` ~~~");
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 124: closing fence must be at least as long as the
+// opening fence.
+// TODO: our parser currently closes on the first line with 3 matching fence
+// chars regardless of opening fence length. Fix as part of the CommonMark
+// TODO in parseMarkdown().
+TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
+ auto Nodes = parseMarkdown("````\naaa\n```", Arena);
+ // The ``` line should not close the ```` fence per CommonMark, but our
+ // parser currently treats it as a closing fence. This test documents the
+ // current (non-conformant) behavior.
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
} // namespace
\ No newline at end of file
>From ffb56028d83a542a775119a7b0c2f88271b2df84 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:59:52 -0400
Subject: [PATCH 04/27] [clang-doc] Replace flat MDNode with typed node
hierarchy using LLVM RTTI
---
.../clang-doc/support/Markdown.cpp | 84 +++---
.../clang-doc/support/Markdown.h | 264 ++++++++++++++++--
.../clang-doc/MarkdownParserTest.cpp | 84 +++---
3 files changed, 312 insertions(+), 120 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 9e008abf8b08d..bee15c3e23ec3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -18,8 +18,24 @@ using namespace llvm;
namespace clang::doc::markdown {
-static MDNode makeText(StringRef S) {
- return {NodeKind::NK_Text, S, {}};
+// Allocates a contiguous array of T in the arena and returns an ArrayRef.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+ BumpPtrAllocator &Arena) {
+ if (Vec.empty())
+ return {};
+ T *Allocated = Arena.Allocate<T>(Vec.size());
+ std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated);
+ return ArrayRef<T>(Allocated, Vec.size());
+}
+
+// Interns a StringRef into the arena so it outlives the parse loop.
+static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
+ if (S.empty())
+ return {};
+ char *Buf = Arena.Allocate<char>(S.size());
+ std::copy(S.begin(), S.end(), Buf);
+ return StringRef(Buf, S.size());
}
// A line is a table separator if it only contains |, -, :, and spaces,
@@ -36,24 +52,15 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
-static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
- BumpPtrAllocator &Arena) {
- if (Nodes.empty())
- return {};
- MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
- std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
- return ArrayRef<MDNode>(Allocated, Nodes.size());
-}
-
-ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
- BumpPtrAllocator &Arena) {
+ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
+ BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
return {};
SmallVector<StringRef, 16> Lines;
ParagraphText.split(Lines, '\n');
- SmallVector<MDNode> Nodes;
+ SmallVector<MDNode *> Nodes;
size_t I = 0, E = Lines.size();
while (I < E) {
@@ -71,8 +78,8 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
// case-by-case basis.
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
- StringRef Lang = Line.drop_front(3).trim();
- SmallVector<MDNode> CodeLines;
+ StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ SmallVector<StringRef> CodeLines;
++I;
while (I < E) {
StringRef CodeLine = Lines[I].trim();
@@ -80,15 +87,13 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(makeText(Lines[I]));
+ CodeLines.push_back(internString(Lines[I], Arena));
++I;
}
++I; // skip closing fence
- MDNode Code;
- Code.Kind = NodeKind::NK_FencedCode;
- Code.Content = Lang;
- Code.Children = allocateNodes(CodeLines, Arena);
- LDBG() << "emitting NK_FencedCode lang='" << Lang
+ auto *Code =
+ new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+ LDBG() << "emitting FencedCodeNode lang='" << Lang
<< "' lines=" << CodeLines.size();
Nodes.push_back(Code);
continue;
@@ -96,50 +101,45 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
- SmallVector<MDNode> Rows;
+ SmallVector<StringRef> Rows;
while (I < E && Lines[I].trim().contains('|')) {
- Rows.push_back(makeText(Lines[I].trim()));
+ Rows.push_back(internString(Lines[I].trim(), Arena));
++I;
}
- MDNode Table;
- Table.Kind = NodeKind::NK_Table;
- Table.Content = {};
- Table.Children = allocateNodes(Rows, Arena);
- LDBG() << "emitting NK_Table rows=" << Rows.size();
+ auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+ LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
continue;
}
// Unordered list item.
if (isListItem(Line)) {
- SmallVector<MDNode> Items;
+ SmallVector<ListItemNode *> Items;
while (I < E) {
StringRef L = Lines[I].trim();
if (!isListItem(L))
break;
- MDNode Item;
- Item.Kind = NodeKind::NK_ListItem;
- Item.Content = L.drop_front(2).trim();
- Item.Children = {};
+ StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+ SmallVector<MDNode *> ItemChildren;
+ ItemChildren.push_back(new (Arena) TextNode(ItemText));
+ auto *Item =
+ new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
Items.push_back(Item);
++I;
}
- MDNode List;
- List.Kind = NodeKind::NK_UnorderedList;
- List.Content = {};
- List.Children = allocateNodes(Items, Arena);
- LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+ auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+ LDBG() << "emitting UnorderedListNode items=" << Items.size();
Nodes.push_back(List);
continue;
}
// Plain text fallback.
- Nodes.push_back(makeText(Line));
+ Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
++I;
}
LDBG() << "parseMarkdown done nodes=" << Nodes.size();
- return allocateNodes(Nodes, Arena);
+ return allocateArray(Nodes, Arena);
}
-} // namespace clang::doc::markdown
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 09b79cc8f2437..3d457bcddfac6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -7,30 +7,50 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// This file defines a standalone Markdown parsing library for the LLVM
-/// ecosystem. The parser takes plain text and returns a tree of typed nodes
-/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+/// Standalone Markdown parsing library for the LLVM ecosystem.
///
-/// This is a simple Markdown parser for use inside Clang-Doc's comment
-/// pipeline. You give it a paragraph of text and an arena allocator, and it
-/// gives back a list of typed nodes describing the Markdown structure it found.
+/// The parser takes plain paragraph text and returns a polymorphic tree of
+/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
+/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
+/// type carries exactly the fields it needs -- no overloaded Content field,
+/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting; each concrete type provides classof() for this purpose.
///
-/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
-/// you get back an empty list and can fall back to plain-text output. If it
-/// does, you get a tree of MDNode structs where each node has a kind, optional
-/// content (like the language tag on a code fence), and optional children.
+/// See
+/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
///
-/// All nodes are allocated in the arena you pass in. You own the arena and are
-/// responsible for keeping it alive as long as you use the nodes.
+/// Field ordering in each derived struct is chosen to minimize padding:
+/// 4-byte fields (like Level or Start) are declared before 16-byte fields
+/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
+/// base class's 4-byte Kind and the first derived field.
///
-/// The parser handles fenced code blocks, pipe tables, and unordered lists.
-/// Anything it does not recognize comes back as a plain text node. It will
-/// never crash on bad input.
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
+/// TextNode -- plain text run
+/// SoftBreakNode -- soft line break
+/// HardBreakNode -- hard line break (trailing spaces or backslash)
+/// InlineCodeNode -- inline code span (`code`)
+/// EmphasisNode -- emphasis (*text* or _text_)
+/// StrongNode -- strong emphasis (**text** or __text__)
+///
+/// Block nodes:
+/// ParagraphNode -- sequence of inline nodes
+/// HeadingNode -- ATX heading (# through ######), level 1-6
+/// FencedCodeNode -- fenced code block (``` or ~~~)
+/// TableNode -- pipe table (raw row text; TODO: structured cells)
+/// UnorderedListNode -- bullet list (-, *, +)
+/// OrderedListNode -- numbered list with explicit start number
+/// ListItemNode -- single item inside a list
+/// BlockQuoteNode -- block quote (>)
+/// ThematicBreakNode -- horizontal rule (---, ***, ___)
+///
+/// All nodes are arena-allocated. The caller owns the arena and must keep it
+/// alive for the lifetime of any returned nodes. The parser never crashes on
+/// malformed input; unrecognized text falls back to TextNode.
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
@@ -38,35 +58,217 @@
namespace clang::doc::markdown {
+/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
+/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
+/// cheap range-based checks in classof() implementations.
enum class NodeKind {
+ // Inline nodes
+ NK_Text,
+ NK_SoftBreak,
+ NK_HardBreak,
+ NK_InlineCode,
+ NK_Emphasis,
+ NK_Strong,
+ NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+
// Block nodes
NK_Paragraph,
+ NK_Heading,
NK_FencedCode,
NK_Table,
NK_UnorderedList,
NK_OrderedList,
NK_ListItem,
+ NK_BlockQuote,
NK_ThematicBreak,
- // Inline nodes
- NK_Text,
- NK_InlineCode,
- NK_Emphasis,
- NK_Strong,
- NK_SoftBreak,
+ NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
};
+/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
+/// Nodes are arena-allocated and have no virtual destructor; use
+/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
struct MDNode {
NodeKind Kind;
- llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
- llvm::ArrayRef<MDNode> Children; // arena allocated
+ explicit MDNode(NodeKind K) : Kind(K) {}
+};
+
+//===----------------------------------------------------------------------===//
+// Inline nodes
+//===----------------------------------------------------------------------===//
+
+/// Plain text run.
+struct TextNode : MDNode {
+ llvm::StringRef Text;
+ explicit TextNode(llvm::StringRef Text)
+ : MDNode(NodeKind::NK_Text), Text(Text) {}
+ static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
+};
+
+/// Soft line break -- a newline that does not end the paragraph.
+struct SoftBreakNode : MDNode {
+ SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_SoftBreak;
+ }
+};
+
+/// Hard line break -- two trailing spaces or a backslash before a newline.
+struct HardBreakNode : MDNode {
+ HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_HardBreak;
+ }
+};
+
+/// Inline code span: `code`. Code does not include the surrounding backticks.
+struct InlineCodeNode : MDNode {
+ llvm::StringRef Code;
+ explicit InlineCodeNode(llvm::StringRef Code)
+ : MDNode(NodeKind::NK_InlineCode), Code(Code) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_InlineCode;
+ }
+};
+
+/// Emphasized text: *text* or _text_.
+struct EmphasisNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Emphasis), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Emphasis;
+ }
};
-/// Parses Markdown from a single comment paragraph's text.
-/// Returns an empty ArrayRef if no Markdown constructs are found,
-/// so generators can fall back to plain-text rendering at zero cost.
-llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
- llvm::BumpPtrAllocator &Arena);
+/// Strongly emphasized text: **text** or __text__.
+struct StrongNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit StrongNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Strong), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Strong;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Block nodes
+//===----------------------------------------------------------------------===//
+
+/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// blank lines.
+struct ParagraphNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Paragraph), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Paragraph;
+ }
+};
+
+/// ATX heading: one to six leading # characters. Level is declared before
+/// Children to avoid padding between the base class's 4-byte Kind and the
+/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+struct HeadingNode : MDNode {
+ unsigned Level; // 1-6
+ llvm::ArrayRef<MDNode *> Children; // inline content
+ HeadingNode(unsigned Level, llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Heading), Level(Level), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Heading;
+ }
+};
+
+/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g.
+/// "cpp"); empty when no language was specified. Lines contains the raw text
+/// of each interior line, without the opening or closing fence.
+///
+/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// to 3 spaces; the closing fence must use the same character and be at least
+/// as long as the opening fence; only spaces may follow the closing fence.
+struct FencedCodeNode : MDNode {
+ llvm::StringRef Lang;
+ llvm::ArrayRef<llvm::StringRef> Lines;
+ FencedCodeNode(llvm::StringRef Lang, llvm::ArrayRef<llvm::StringRef> Lines)
+ : MDNode(NodeKind::NK_FencedCode), Lang(Lang), Lines(Lines) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_FencedCode;
+ }
+};
+
+/// Pipe table. Rows contains the raw text of each row line including the
+/// header and separator rows.
+/// TODO: replace with a structured header/body/cell representation.
+struct TableNode : MDNode {
+ llvm::ArrayRef<llvm::StringRef> Rows;
+ explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows)
+ : MDNode(NodeKind::NK_Table), Rows(Rows) {}
+ static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
+};
+
+/// A single list item. Children may contain block-level nodes for loose
+/// lists, or a single inline sequence for tight lists.
+struct ListItemNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_ListItem), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_ListItem;
+ }
+};
+
+/// Unordered (bullet) list. Markers are -, *, or +.
+struct UnorderedListNode : MDNode {
+ llvm::ArrayRef<ListItemNode *> Items;
+ explicit UnorderedListNode(llvm::ArrayRef<ListItemNode *> Items)
+ : MDNode(NodeKind::NK_UnorderedList), Items(Items) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_UnorderedList;
+ }
+};
+
+/// Ordered (numbered) list. Start is the number on the first item. Start is
+/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+struct OrderedListNode : MDNode {
+ unsigned Start;
+ llvm::ArrayRef<ListItemNode *> Items;
+ OrderedListNode(unsigned Start, llvm::ArrayRef<ListItemNode *> Items)
+ : MDNode(NodeKind::NK_OrderedList), Start(Start), Items(Items) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_OrderedList;
+ }
+};
+
+/// Block quote (> ...). Children are block-level nodes inside the quote.
+struct BlockQuoteNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit BlockQuoteNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_BlockQuote), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_BlockQuote;
+ }
+};
+
+/// Thematic break: a line of three or more ---, ***, or ___ characters.
+struct ThematicBreakNode : MDNode {
+ ThematicBreakNode() : MDNode(NodeKind::NK_ThematicBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_ThematicBreak;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Parser entry point
+//===----------------------------------------------------------------------===//
+
+/// Parse Markdown from a single paragraph of plain text. Returns a list of
+/// top-level block nodes allocated in Arena. Returns an empty ArrayRef if no
+/// Markdown constructs are found, letting callers fall back to plain-text
+/// rendering at zero cost. The parser never crashes on malformed input.
+///
+/// The caller must keep Arena alive for the lifetime of any returned nodes.
+llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
+ llvm::BumpPtrAllocator &Arena);
} // namespace clang::doc::markdown
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 4ca979c1f1d24..b61094f034375 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -8,9 +8,11 @@
#include "support/Markdown.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "gtest/gtest.h"
using namespace clang::doc::markdown;
+using namespace llvm;
namespace {
@@ -31,9 +33,8 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_Text);
- EXPECT_EQ(N.Content, "hello world");
+ auto *N = cast<TextNode>(Nodes[0]);
+ EXPECT_EQ(N->Text, "hello world");
}
TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -42,10 +43,9 @@ int x = 0;
````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "cpp");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
@@ -54,9 +54,8 @@ some code
```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Content.empty());
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lang.empty());
}
TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
@@ -74,7 +73,7 @@ TEST_F(MarkdownParserTest, PipeTable) {
| 1 | 2 |)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+ EXPECT_TRUE(isa<TableNode>(Nodes[0]));
}
TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
@@ -82,8 +81,8 @@ TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
c | d)",
Arena);
// No separator row so should not be parsed as a table.
- for (const auto &Node : Nodes)
- EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+ for (const auto *Node : Nodes)
+ EXPECT_FALSE(isa<TableNode>(Node));
}
TEST_F(MarkdownParserTest, UnorderedList) {
@@ -92,12 +91,11 @@ TEST_F(MarkdownParserTest, UnorderedList) {
- baz)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
- ASSERT_EQ(N.Children.size(), 3u);
- EXPECT_EQ(N.Children[0].Content, "foo");
- EXPECT_EQ(N.Children[1].Content, "bar");
- EXPECT_EQ(N.Children[2].Content, "baz");
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+ EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
+ EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
}
TEST_F(MarkdownParserTest, MixedContent) {
@@ -117,10 +115,9 @@ int x = 0;
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Content.empty());
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lang.empty());
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 120: tilde fence with a language tag.
@@ -130,10 +127,9 @@ int x = 0;
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "cpp");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
@@ -144,10 +140,9 @@ aaa
````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
// ~~~ is content, not a closing fence.
- ASSERT_EQ(N.Children.size(), 2u);
+ ASSERT_EQ(N->Lines.size(), 2u);
}
// CommonMark §4.5 example 130: a code block can be empty.
@@ -156,18 +151,16 @@ TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Children.empty());
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lines.empty());
}
// CommonMark §4.5 example 129: a code block may contain only blank lines.
TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
auto Nodes = parseMarkdown("```\n\n \n```", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- ASSERT_EQ(N.Children.size(), 2u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ ASSERT_EQ(N->Lines.size(), 2u);
}
// CommonMark §4.5 example 142: lang tag is captured from the info string.
@@ -179,10 +172,9 @@ end
``````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "ruby");
- ASSERT_EQ(N.Children.size(), 3u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "ruby");
+ ASSERT_EQ(N->Lines.size(), 3u);
}
// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
@@ -192,10 +184,9 @@ foo
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "aa ``` ~~~");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "aa ``` ~~~");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 124: closing fence must be at least as long as the
@@ -209,9 +200,8 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
// parser currently treats it as a closing fence. This test documents the
// current (non-conformant) behavior.
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ ASSERT_EQ(N->Lines.size(), 1u);
}
} // namespace
\ No newline at end of file
>From 25fe7daff183f51a8b31ed0d8481b9a2f1fbdbd8 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 02:33:44 -0400
Subject: [PATCH 05/27] [clang-doc] Introduce LineReader cursor for the
Markdown parse loop
Replace the raw size_t I = 0, E = Lines.size() index arithmetic in
parseMarkdown() with a LineReader cursor that encapsulates the position
and exposes peek(), peek(Offset), advance(), and atEnd(). The parse
logic and emitted nodes are unchanged; this only removes manual index
bookkeeping. All 18 MarkdownParserTest cases still pass.
Co-Authored-By: Claude Opus 4.8 <noreply at anthropic.com>
---
.../clang-doc/support/Markdown.cpp | 73 ++++++++++++++-----
1 file changed, 54 insertions(+), 19 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index bee15c3e23ec3..f171457e73046 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
+#include <cassert>
#define DEBUG_TYPE "clang-doc"
@@ -52,6 +53,42 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
+// A forward cursor over the lines of a paragraph. Encapsulates the parse
+// position so the loop can inspect the current or an upcoming line and consume
+// lines without manual index arithmetic. Lines are stored untrimmed; callers
+// trim where they need a normalized view.
+class LineReader {
+public:
+ explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+ // True once every line has been consumed.
+ bool atEnd() const { return Pos >= Lines.size(); }
+
+ // The current line, untrimmed. Must not be called when atEnd().
+ StringRef peek() const {
+ assert(!atEnd() && "peek past end of input");
+ return Lines[Pos];
+ }
+
+ // The line Offset positions ahead of the cursor, or an empty StringRef when
+ // that position is past the end. peek(0) is the current line.
+ StringRef peek(size_t Offset) const {
+ size_t Target = Pos + Offset;
+ return Target < Lines.size() ? Lines[Target] : StringRef();
+ }
+
+ // Consume the current line and return it, untrimmed. Must not be called when
+ // atEnd().
+ StringRef advance() {
+ assert(!atEnd() && "advance past end of input");
+ return Lines[Pos++];
+ }
+
+private:
+ ArrayRef<StringRef> Lines;
+ size_t Pos = 0;
+};
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -61,13 +98,13 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
ParagraphText.split(Lines, '\n');
SmallVector<MDNode *> Nodes;
- size_t I = 0, E = Lines.size();
+ LineReader Reader(Lines);
- while (I < E) {
- StringRef Line = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef Line = Reader.peek().trim();
if (Line.empty()) {
- ++I;
+ Reader.advance();
continue;
}
@@ -79,18 +116,18 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ Reader.advance(); // consume opening fence
SmallVector<StringRef> CodeLines;
- ++I;
- while (I < E) {
- StringRef CodeLine = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef CodeLine = Reader.peek().trim();
if (CodeLine.size() >= 3 &&
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(internString(Lines[I], Arena));
- ++I;
+ CodeLines.push_back(internString(Reader.advance(), Arena));
}
- ++I; // skip closing fence
+ if (!Reader.atEnd())
+ Reader.advance(); // consume closing fence
auto *Code =
new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
LDBG() << "emitting FencedCodeNode lang='" << Lang
@@ -100,12 +137,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
}
// Pipe table: current line has | and next line is a separator row.
- if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+ if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
SmallVector<StringRef> Rows;
- while (I < E && Lines[I].trim().contains('|')) {
- Rows.push_back(internString(Lines[I].trim(), Arena));
- ++I;
- }
+ while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+ Rows.push_back(internString(Reader.advance().trim(), Arena));
auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
@@ -115,8 +150,8 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Unordered list item.
if (isListItem(Line)) {
SmallVector<ListItemNode *> Items;
- while (I < E) {
- StringRef L = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
if (!isListItem(L))
break;
StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
@@ -125,7 +160,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
auto *Item =
new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
Items.push_back(Item);
- ++I;
+ Reader.advance();
}
auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
LDBG() << "emitting UnorderedListNode items=" << Items.size();
@@ -135,7 +170,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Plain text fallback.
Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
- ++I;
+ Reader.advance();
}
LDBG() << "parseMarkdown done nodes=" << Nodes.size();
>From 060bf63fe9f19fa45ef941f10594897351591d56 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 02:44:27 -0400
Subject: [PATCH 06/27] [clang-doc] Parse inline emphasis, strong, and code in
Markdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an inline pass over paragraph text that recognizes emphasis
(*text* or _text_), strong (**text** or __text__), and inline code
(`code`), emitting the EmphasisNode, StrongNode, and InlineCodeNode
types already in the hierarchy. Emphasis and strong recurse into their
content, and runs that match no construct stay plain TextNodes.
Delimiter matching uses a simplified subset of the CommonMark §6
flanking rules: a delimiter opens only with non-whitespace inside it and
closes only with non-whitespace before it, and code spans close on a
backtick run of equal length. The full delimiter-stack model is left as
a TODO. Adds 12 unit tests covering each construct plus the unmatched
and unterminated cases.
Co-Authored-By: Claude Opus 4.8 <noreply at anthropic.com>
---
.../clang-doc/support/Markdown.cpp | 121 +++++++++++++++++-
.../clang-doc/MarkdownParserTest.cpp | 97 ++++++++++++++
2 files changed, 216 insertions(+), 2 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f171457e73046..f1af4f5430772 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -8,6 +8,7 @@
#include "Markdown.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
@@ -89,6 +90,121 @@ class LineReader {
size_t Pos = 0;
};
+// Returns the number of consecutive copies of C starting at S[Start].
+static size_t countRun(StringRef S, size_t Start, char C) {
+ size_t I = Start;
+ while (I < S.size() && S[I] == C)
+ ++I;
+ return I - Start;
+}
+
+// Strips one leading and one trailing space from a code span's content when
+// both are present and the content is not all spaces, per CommonMark §6.1.
+static StringRef trimCodeSpan(StringRef Code) {
+ if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' &&
+ Code.find_first_not_of(' ') != StringRef::npos)
+ return Code.drop_front().drop_back();
+ return Code;
+}
+
+// Finds the start index of a closing emphasis run of exactly Count copies of C,
+// searching forward from From. Requires non-whitespace immediately inside both
+// the opening and closing delimiters and non-empty content, a simplified take
+// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
+// closing run exists.
+static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+ size_t E = S.size();
+ // Opening delimiter is not left-flanking if whitespace follows it.
+ if (From >= E || isSpace(S[From]))
+ return StringRef::npos;
+ for (size_t J = From; J + Count <= E; ++J) {
+ if (S[J] != C)
+ continue;
+ size_t Run = countRun(S, J, C);
+ if (Run != Count) {
+ J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
+ continue;
+ }
+ // Reject empty content and closing runs that are not right-flanking.
+ if (J == From || isSpace(S[J - 1]))
+ continue;
+ return J;
+ }
+ return StringRef::npos;
+}
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
+// _text_). Runs that match no construct become TextNodes. Emphasis and strong
+// recurse so their content may itself contain inline constructs. Text with no
+// markers yields a single TextNode.
+//
+// TODO: This covers the common cases but not the full CommonMark §6 inline
+// model (delimiter stacks, intraword underscore rules, links, autolinks).
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+ SmallVector<MDNode *> Nodes;
+ size_t TextStart = 0, I = 0, E = S.size();
+
+ auto flushText = [&](size_t End) {
+ if (End > TextStart)
+ Nodes.push_back(new (Arena) TextNode(
+ internString(S.substr(TextStart, End - TextStart), Arena)));
+ };
+
+ while (I < E) {
+ char C = S[I];
+
+ // Inline code span: a run of N backticks closed by a run of N backticks.
+ if (C == '`') {
+ size_t N = countRun(S, I, '`');
+ size_t J = I + N;
+ while (J < E && countRun(S, J, '`') != N)
+ J += S[J] == '`' ? countRun(S, J, '`') : 1;
+ if (J < E) {
+ flushText(I);
+ StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+ Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+ I = J + N;
+ TextStart = I;
+ continue;
+ }
+ // No closing run; leave the backticks as literal text.
+ I += N;
+ continue;
+ }
+
+ // Emphasis (*text*, _text_) and strong (**text**, __text__).
+ if (C == '*' || C == '_') {
+ // Strong binds the two-delimiter form before single-delimiter emphasis.
+ if (I + 1 < E && S[I + 1] == C) {
+ size_t Close = findClosingDelim(S, I + 2, C, 2);
+ if (Close != StringRef::npos) {
+ flushText(I);
+ StringRef Inner = S.substr(I + 2, Close - (I + 2));
+ Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
+ I = Close + 2;
+ TextStart = I;
+ continue;
+ }
+ }
+ size_t Close = findClosingDelim(S, I + 1, C, 1);
+ if (Close != StringRef::npos) {
+ flushText(I);
+ StringRef Inner = S.substr(I + 1, Close - (I + 1));
+ Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
+ I = Close + 1;
+ TextStart = I;
+ continue;
+ }
+ }
+
+ ++I;
+ }
+
+ flushText(E);
+ return allocateArray(Nodes, Arena);
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -168,8 +284,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
- // Plain text fallback.
- Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
+ // Plain text, scanned for inline constructs (emphasis, strong, code).
+ for (MDNode *Inline : parseInline(Line, Arena))
+ Nodes.push_back(Inline);
Reader.advance();
}
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index b61094f034375..ea72dacfb08e5 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -204,4 +204,101 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
ASSERT_EQ(N->Lines.size(), 1u);
}
+TEST_F(MarkdownParserTest, EmphasisAsterisk) {
+ auto Nodes = parseMarkdown("an *important* word", Arena);
+ ASSERT_EQ(Nodes.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(Nodes[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word");
+}
+
+TEST_F(MarkdownParserTest, EmphasisUnderscore) {
+ auto Nodes = parseMarkdown("_em_", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Em = cast<EmphasisNode>(Nodes[0]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em");
+}
+
+TEST_F(MarkdownParserTest, StrongAsterisk) {
+ auto Nodes = parseMarkdown("**bold**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *St = cast<StrongNode>(Nodes[0]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+TEST_F(MarkdownParserTest, StrongUnderscore) {
+ auto Nodes = parseMarkdown("__bold__", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *St = cast<StrongNode>(Nodes[0]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+// Two delimiters must be parsed as strong, not as nested emphasis.
+TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) {
+ auto Nodes = parseMarkdown("**strong**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<StrongNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, InlineCode) {
+ auto Nodes = parseMarkdown("call `foo()` here", Arena);
+ ASSERT_EQ(Nodes.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call ");
+ EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()");
+ EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here");
+}
+
+// CommonMark §6.1: a doubled backtick fence lets the span contain a single
+// backtick.
+TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) {
+ auto Nodes = parseMarkdown("``a`b``", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b");
+}
+
+// Emphasis and strong recurse, so a code span inside emphasis is parsed.
+TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
+ auto Nodes = parseMarkdown("*see `x`*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Em = cast<EmphasisNode>(Nodes[0]);
+ ASSERT_EQ(Em->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see ");
+ EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x");
+}
+
+TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
+ auto Nodes = parseMarkdown("**a `b`**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *St = cast<StrongNode>(Nodes[0]);
+ ASSERT_EQ(St->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a ");
+ EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b");
+}
+
+// A delimiter with whitespace on the inside does not open emphasis.
+TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) {
+ auto Nodes = parseMarkdown("a * b", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b");
+}
+
+// An unterminated code span leaves the backtick as literal text.
+TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) {
+ auto Nodes = parseMarkdown("a `b c", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c");
+}
+
+// Inline parsing must not disturb plain text with no markers.
+TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
+ auto Nodes = parseMarkdown("just words", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
+}
+
} // namespace
\ No newline at end of file
>From 0af1c8e2999a20e2044cc337a8c4f0d8112d208b Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 10:18:32 -0400
Subject: [PATCH 07/27] [clang-doc] Address review feedback: rename inline
parser variables, simplify header docs
---
.../clang-doc/support/Markdown.cpp | 54 ++++++++++---------
.../clang-doc/support/Markdown.h | 25 +++------
2 files changed, 34 insertions(+), 45 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f1af4f5430772..ef29daa76a166 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -143,7 +143,7 @@ static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
// model (delimiter stacks, intraword underscore rules, links, autolinks).
static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
SmallVector<MDNode *> Nodes;
- size_t TextStart = 0, I = 0, E = S.size();
+ size_t TextStart = 0, Pos = 0, E = S.size();
auto flushText = [&](size_t End) {
if (End > TextStart)
@@ -151,54 +151,56 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
internString(S.substr(TextStart, End - TextStart), Arena)));
};
- while (I < E) {
- char C = S[I];
+ while (Pos < E) {
+ char C = S[Pos];
- // Inline code span: a run of N backticks closed by a run of N backticks.
+ // Inline code span: an opening backtick run closed by a run of the same
+ // length.
if (C == '`') {
- size_t N = countRun(S, I, '`');
- size_t J = I + N;
- while (J < E && countRun(S, J, '`') != N)
- J += S[J] == '`' ? countRun(S, J, '`') : 1;
- if (J < E) {
- flushText(I);
- StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+ size_t OpenLen = countRun(S, Pos, '`');
+ size_t ClosePos = Pos + OpenLen;
+ while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen)
+ ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
+ if (ClosePos < E) {
+ flushText(Pos);
+ StringRef Code =
+ trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
- I = J + N;
- TextStart = I;
+ Pos = ClosePos + OpenLen;
+ TextStart = Pos;
continue;
}
// No closing run; leave the backticks as literal text.
- I += N;
+ Pos += OpenLen;
continue;
}
// Emphasis (*text*, _text_) and strong (**text**, __text__).
if (C == '*' || C == '_') {
// Strong binds the two-delimiter form before single-delimiter emphasis.
- if (I + 1 < E && S[I + 1] == C) {
- size_t Close = findClosingDelim(S, I + 2, C, 2);
+ if (Pos + 1 < E && S[Pos + 1] == C) {
+ size_t Close = findClosingDelim(S, Pos + 2, C, 2);
if (Close != StringRef::npos) {
- flushText(I);
- StringRef Inner = S.substr(I + 2, Close - (I + 2));
+ flushText(Pos);
+ StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
- I = Close + 2;
- TextStart = I;
+ Pos = Close + 2;
+ TextStart = Pos;
continue;
}
}
- size_t Close = findClosingDelim(S, I + 1, C, 1);
+ size_t Close = findClosingDelim(S, Pos + 1, C, 1);
if (Close != StringRef::npos) {
- flushText(I);
- StringRef Inner = S.substr(I + 1, Close - (I + 1));
+ flushText(Pos);
+ StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
- I = Close + 1;
- TextStart = I;
+ Pos = Close + 1;
+ TextStart = Pos;
continue;
}
}
- ++I;
+ ++Pos;
}
flushText(E);
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 3d457bcddfac6..60390465588c3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -9,20 +9,10 @@
/// \file
/// Standalone Markdown parsing library for the LLVM ecosystem.
///
-/// The parser takes plain paragraph text and returns a polymorphic tree of
-/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
-/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
-/// type carries exactly the fields it needs -- no overloaded Content field,
-/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
-/// downcasting; each concrete type provides classof() for this purpose.
-///
-/// See
-/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
-///
-/// Field ordering in each derived struct is chosen to minimize padding:
-/// 4-byte fields (like Level or Start) are declared before 16-byte fields
-/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
-/// base class's 4-byte Kind and the first derived field.
+/// The parser takes a single paragraph of plain text and returns a list of
+/// nodes describing the Markdown it found. Each kind of construct has its own
+/// node type, and every node shares a common MDNode base, so you can use
+/// llvm::isa<>/cast<>/dyn_cast<> to check what a node is.
///
/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
/// TextNode -- plain text run
@@ -165,9 +155,7 @@ struct ParagraphNode : MDNode {
}
};
-/// ATX heading: one to six leading # characters. Level is declared before
-/// Children to avoid padding between the base class's 4-byte Kind and the
-/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+/// ATX heading: one to six leading # characters.
struct HeadingNode : MDNode {
unsigned Level; // 1-6
llvm::ArrayRef<MDNode *> Children; // inline content
@@ -226,8 +214,7 @@ struct UnorderedListNode : MDNode {
}
};
-/// Ordered (numbered) list. Start is the number on the first item. Start is
-/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+/// Ordered (numbered) list. Start is the number on the first item.
struct OrderedListNode : MDNode {
unsigned Start;
llvm::ArrayRef<ListItemNode *> Items;
>From b76bfa182db40e7a358ffb7d42506aff24453e14 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 11:50:32 -0400
Subject: [PATCH 08/27] [clang-doc] Add libFuzzer harness for parseMarkdown()
---
clang-tools-extra/clang-doc/CMakeLists.txt | 1 +
.../clang-doc/fuzzer/CMakeLists.txt | 21 +++++++++++++
.../clang-doc/fuzzer/DummyMarkdownFuzzer.cpp | 21 +++++++++++++
.../clang-doc/fuzzer/FuzzMarkdown.cpp | 30 +++++++++++++++++++
4 files changed, 73 insertions(+)
create mode 100644 clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
create mode 100644 clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
create mode 100644 clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt
index 22e2c8159e9f6..f64d1129ed4af 100644
--- a/clang-tools-extra/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/CMakeLists.txt
@@ -44,6 +44,7 @@ target_link_libraries(clangDoc
)
add_subdirectory(tool)
+add_subdirectory(fuzzer)
if (LLVM_INCLUDE_BENCHMARKS)
add_subdirectory(benchmarks)
diff --git a/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
new file mode 100644
index 0000000000000..5e6e943891052
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Resolve "support/Markdown.h" against the parent clang-doc directory.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(LLVM_LINK_COMPONENTS
+ FuzzerCLI
+ Support
+ )
+
+# This fuzzer runs on oss-fuzz, so keep it around even if it looks unreferenced.
+# With a fuzzing engine configured (LLVM_USE_SANITIZE_COVERAGE or an external
+# LLVM_LIB_FUZZING_ENGINE) this builds a real fuzz target; otherwise DUMMY_MAIN
+# provides a main() so it still builds and can be replayed over saved inputs.
+add_llvm_fuzzer(clang-doc-markdown-fuzzer
+ FuzzMarkdown.cpp
+ DUMMY_MAIN DummyMarkdownFuzzer.cpp
+ )
+
+target_link_libraries(clang-doc-markdown-fuzzer
+ PRIVATE
+ clangDocSupport
+ )
diff --git a/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
new file mode 100644
index 0000000000000..61466e0fa4ef6
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
@@ -0,0 +1,21 @@
+//===-- DummyMarkdownFuzzer.cpp - Entry point to test the fuzzer ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of main so we can build and test the harness without linking
+// libFuzzer. Each command line argument is treated as a file to run the
+// harness on.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/FuzzMutate/FuzzerCLI.h"
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+int main(int argc, char *argv[]) {
+ return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput);
+}
diff --git a/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
new file mode 100644
index 0000000000000..e407b3baccf2e
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
@@ -0,0 +1,30 @@
+//===-- FuzzMarkdown.cpp - Fuzzer for the clang-doc Markdown parser -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a libFuzzer harness for parseMarkdown(). It feeds
+/// arbitrary bytes to the parser and checks that it never crashes. The parsed
+/// nodes are walked so the returned tree is exercised, not just allocated.
+///
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstddef>
+#include <cstdint>
+
+using namespace clang::doc::markdown;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ llvm::BumpPtrAllocator Arena;
+ llvm::StringRef Input(reinterpret_cast<const char *>(Data), Size);
+ for (const MDNode *Node : parseMarkdown(Input, Arena))
+ (void)Node->Kind;
+ return 0;
+}
>From 77e28993d7a167410fd1a1ee97d2824945b44063 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 18:47:47 -0400
Subject: [PATCH 09/27] [clang-doc] Address review feedback: rename
findClosingDelim params, add table TODO, fix EOF newline
---
.../clang-doc/support/Markdown.cpp | 28 +++++++++++--------
.../clang-doc/MarkdownParserTest.cpp | 2 +-
2 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index ef29daa76a166..6a57cd7900ea2 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -107,26 +107,27 @@ static StringRef trimCodeSpan(StringRef Code) {
return Code;
}
-// Finds the start index of a closing emphasis run of exactly Count copies of C,
-// searching forward from From. Requires non-whitespace immediately inside both
-// the opening and closing delimiters and non-empty content, a simplified take
-// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
-// closing run exists.
-static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+// Finds the start index of a closing emphasis run of exactly DelimLen copies of
+// DelimChar, searching forward from StartPos. Requires non-whitespace
+// immediately inside both the opening and closing delimiters and non-empty
+// content, a simplified take on the CommonMark §6.2 flanking rules. Returns
+// StringRef::npos if no valid closing run exists.
+static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
+ size_t DelimLen) {
size_t E = S.size();
// Opening delimiter is not left-flanking if whitespace follows it.
- if (From >= E || isSpace(S[From]))
+ if (StartPos >= E || isSpace(S[StartPos]))
return StringRef::npos;
- for (size_t J = From; J + Count <= E; ++J) {
- if (S[J] != C)
+ for (size_t J = StartPos; J + DelimLen <= E; ++J) {
+ if (S[J] != DelimChar)
continue;
- size_t Run = countRun(S, J, C);
- if (Run != Count) {
+ size_t Run = countRun(S, J, DelimChar);
+ if (Run != DelimLen) {
J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
continue;
}
// Reject empty content and closing runs that are not right-flanking.
- if (J == From || isSpace(S[J - 1]))
+ if (J == StartPos || isSpace(S[J - 1]))
continue;
return J;
}
@@ -257,6 +258,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
SmallVector<StringRef> Rows;
+ // TODO: Rows are kept as raw line text for now. Table cells may contain
+ // inline content (emphasis, code spans, links), so each row may need to
+ // be split on '|' and parsed further into structured cells.
while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
Rows.push_back(internString(Reader.advance().trim(), Arena));
auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ea72dacfb08e5..28bb9d567e6bc 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -301,4 +301,4 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
}
-} // namespace
\ No newline at end of file
+} // namespace
>From f33ef2ce3f9292e10f1e1dd220a500070ef21bc5 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:29:15 -0400
Subject: [PATCH 10/27] [clang-doc] Address review feedback: make
UnterminatedFence and MixedContent tests explicit
---
.../unittests/clang-doc/MarkdownParserTest.cpp | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 28bb9d567e6bc..207ae938c299a 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -58,13 +58,17 @@ some code
EXPECT_TRUE(N->Lang.empty());
}
-TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+TEST_F(MarkdownParserTest, UnterminatedFenceProducesCodeNode) {
auto Nodes = parseMarkdown(R"(```cpp
int x = 0;)",
Arena);
- // Unterminated fence should not crash and should produce a code node
- // with whatever lines were found.
- EXPECT_FALSE(Nodes.empty());
+ // An unterminated fence should not crash. The parser falls back to emitting a
+ // FencedCodeNode with whatever lines were found before the end of input.
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
+ EXPECT_EQ(N->Lines[0], "int x = 0;");
}
TEST_F(MarkdownParserTest, PipeTable) {
@@ -105,7 +109,10 @@ code
````````
- item)",
Arena);
- EXPECT_EQ(Nodes.size(), 3u);
+ ASSERT_EQ(Nodes.size(), 3u);
+ EXPECT_TRUE(isa<TextNode>(Nodes[0]));
+ EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1]));
+ EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2]));
}
// CommonMark §4.5 example 120: tilde fences work the same as backtick fences.
>From 4371be42e6ccb7a955301c77b5b732e45675347d Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:35:54 -0400
Subject: [PATCH 11/27] [clang-doc] Replace internString with
llvm::StringSaver, matching Mustache pattern
---
.../clang-doc/support/Markdown.cpp | 34 ++++++++-----------
1 file changed, 15 insertions(+), 19 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 6a57cd7900ea2..be2800bff5df7 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/StringSaver.h"
#include <cassert>
#define DEBUG_TYPE "clang-doc"
@@ -31,15 +32,6 @@ static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
return ArrayRef<T>(Allocated, Vec.size());
}
-// Interns a StringRef into the arena so it outlives the parse loop.
-static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
- if (S.empty())
- return {};
- char *Buf = Arena.Allocate<char>(S.size());
- std::copy(S.begin(), S.end(), Buf);
- return StringRef(Buf, S.size());
-}
-
// A line is a table separator if it only contains |, -, :, and spaces,
// and has at least one -.
static bool isSepRow(StringRef Line) {
@@ -142,14 +134,15 @@ static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
//
// TODO: This covers the common cases but not the full CommonMark §6 inline
// model (delimiter stacks, intraword underscore rules, links, autolinks).
-static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
SmallVector<MDNode *> Nodes;
size_t TextStart = 0, Pos = 0, E = S.size();
auto flushText = [&](size_t End) {
if (End > TextStart)
Nodes.push_back(new (Arena) TextNode(
- internString(S.substr(TextStart, End - TextStart), Arena)));
+ Saver.save(S.substr(TextStart, End - TextStart))));
};
while (Pos < E) {
@@ -166,7 +159,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
flushText(Pos);
StringRef Code =
trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
- Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+ Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
Pos = ClosePos + OpenLen;
TextStart = Pos;
continue;
@@ -184,7 +177,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
if (Close != StringRef::npos) {
flushText(Pos);
StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
- Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
+ Nodes.push_back(new (Arena)
+ StrongNode(parseInline(Inner, Arena, Saver)));
Pos = Close + 2;
TextStart = Pos;
continue;
@@ -194,7 +188,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
if (Close != StringRef::npos) {
flushText(Pos);
StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
- Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
+ Nodes.push_back(new (Arena)
+ EmphasisNode(parseInline(Inner, Arena, Saver)));
Pos = Close + 1;
TextStart = Pos;
continue;
@@ -213,6 +208,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
if (ParagraphText.trim().empty())
return {};
+ StringSaver Saver(Arena);
SmallVector<StringRef, 16> Lines;
ParagraphText.split(Lines, '\n');
@@ -234,7 +230,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// case-by-case basis.
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
- StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ StringRef Lang = Saver.save(Line.drop_front(3).trim());
Reader.advance(); // consume opening fence
SmallVector<StringRef> CodeLines;
while (!Reader.atEnd()) {
@@ -243,7 +239,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(internString(Reader.advance(), Arena));
+ CodeLines.push_back(Saver.save(Reader.advance()));
}
if (!Reader.atEnd())
Reader.advance(); // consume closing fence
@@ -262,7 +258,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// inline content (emphasis, code spans, links), so each row may need to
// be split on '|' and parsed further into structured cells.
while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
- Rows.push_back(internString(Reader.advance().trim(), Arena));
+ Rows.push_back(Saver.save(Reader.advance().trim()));
auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
@@ -276,7 +272,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
StringRef L = Reader.peek().trim();
if (!isListItem(L))
break;
- StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+ StringRef ItemText = Saver.save(L.drop_front(2).trim());
SmallVector<MDNode *> ItemChildren;
ItemChildren.push_back(new (Arena) TextNode(ItemText));
auto *Item =
@@ -291,7 +287,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
}
// Plain text, scanned for inline constructs (emphasis, strong, code).
- for (MDNode *Inline : parseInline(Line, Arena))
+ for (MDNode *Inline : parseInline(Line, Arena, Saver))
Nodes.push_back(Inline);
Reader.advance();
}
>From 0b5f53715fc6e78a56145609893fa61f5cf4f353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:59:09 -0400
Subject: [PATCH 12/27] [clang-doc] Address review feedback: fix comment
accuracy and trim AI-sounding language
---
clang-tools-extra/clang-doc/support/Markdown.h | 12 +++++-------
.../unittests/clang-doc/MarkdownParserTest.cpp | 13 +++++--------
2 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 60390465588c3..8c2055868671a 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -49,8 +49,7 @@
namespace clang::doc::markdown {
/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
-/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
-/// cheap range-based checks in classof() implementations.
+/// block kinds.
enum class NodeKind {
// Inline nodes
NK_Text,
@@ -193,8 +192,7 @@ struct TableNode : MDNode {
static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
};
-/// A single list item. Children may contain block-level nodes for loose
-/// lists, or a single inline sequence for tight lists.
+/// A single list item. Children holds the item's inline content.
struct ListItemNode : MDNode {
llvm::ArrayRef<MDNode *> Children;
explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
@@ -248,9 +246,9 @@ struct ThematicBreakNode : MDNode {
//===----------------------------------------------------------------------===//
/// Parse Markdown from a single paragraph of plain text. Returns a list of
-/// top-level block nodes allocated in Arena. Returns an empty ArrayRef if no
-/// Markdown constructs are found, letting callers fall back to plain-text
-/// rendering at zero cost. The parser never crashes on malformed input.
+/// top-level nodes allocated in Arena. Returns an empty ArrayRef only for empty
+/// or whitespace-only input; plain text with no Markdown constructs returns a
+/// single TextNode.
///
/// The caller must keep Arena alive for the lifetime of any returned nodes.
llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 207ae938c299a..e2fd07159d446 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -196,16 +196,13 @@ foo
ASSERT_EQ(N->Lines.size(), 1u);
}
-// CommonMark §4.5 example 124: closing fence must be at least as long as the
-// opening fence.
-// TODO: our parser currently closes on the first line with 3 matching fence
-// chars regardless of opening fence length. Fix as part of the CommonMark
-// TODO in parseMarkdown().
+// CommonMark §4.5 example 124: the closing fence must be at least as long as
+// the opening fence. Our parser closes on the first line with 3 matching fence
+// chars regardless of opening length, so this documents the current
+// non-conformant behavior.
+// TODO: fix as part of the CommonMark TODO in parseMarkdown().
TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
auto Nodes = parseMarkdown("````\naaa\n```", Arena);
- // The ``` line should not close the ```` fence per CommonMark, but our
- // parser currently treats it as a closing fence. This test documents the
- // current (non-conformant) behavior.
ASSERT_EQ(Nodes.size(), 1u);
auto *N = cast<FencedCodeNode>(Nodes[0]);
ASSERT_EQ(N->Lines.size(), 1u);
>From 14f455ecde0305ec38e20ca6068b0d8f5f259776 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:07:02 -0400
Subject: [PATCH 13/27] [clang-doc] Wrap plain-text paragraph lines in
ParagraphNode
---
.../clang-doc/support/Markdown.cpp | 7 +-
.../clang-doc/MarkdownParserTest.cpp | 67 +++++++++++++------
2 files changed, 50 insertions(+), 24 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index be2800bff5df7..59e651d2b8b05 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -286,9 +286,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
- // Plain text, scanned for inline constructs (emphasis, strong, code).
- for (MDNode *Inline : parseInline(Line, Arena, Saver))
- Nodes.push_back(Inline);
+ // Plain text line: scan for inline constructs (emphasis, strong, code) and
+ // wrap the result in a paragraph.
+ auto Inlines = parseInline(Line, Arena, Saver);
+ Nodes.push_back(new (Arena) ParagraphNode(Inlines));
Reader.advance();
}
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index e2fd07159d446..63d978061b99b 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -33,8 +33,9 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *N = cast<TextNode>(Nodes[0]);
- EXPECT_EQ(N->Text, "hello world");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "hello world");
}
TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -110,7 +111,7 @@ code
- item)",
Arena);
ASSERT_EQ(Nodes.size(), 3u);
- EXPECT_TRUE(isa<TextNode>(Nodes[0]));
+ EXPECT_TRUE(isa<ParagraphNode>(Nodes[0]));
EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1]));
EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2]));
}
@@ -210,18 +211,22 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
TEST_F(MarkdownParserTest, EmphasisAsterisk) {
auto Nodes = parseMarkdown("an *important* word", Arena);
- ASSERT_EQ(Nodes.size(), 3u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an ");
- auto *Em = cast<EmphasisNode>(Nodes[1]);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
ASSERT_EQ(Em->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
- EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word");
+ EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " word");
}
TEST_F(MarkdownParserTest, EmphasisUnderscore) {
auto Nodes = parseMarkdown("_em_", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *Em = cast<EmphasisNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *Em = cast<EmphasisNode>(P->Children[0]);
ASSERT_EQ(Em->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em");
}
@@ -229,7 +234,9 @@ TEST_F(MarkdownParserTest, EmphasisUnderscore) {
TEST_F(MarkdownParserTest, StrongAsterisk) {
auto Nodes = parseMarkdown("**bold**", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *St = cast<StrongNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *St = cast<StrongNode>(P->Children[0]);
ASSERT_EQ(St->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
}
@@ -237,7 +244,9 @@ TEST_F(MarkdownParserTest, StrongAsterisk) {
TEST_F(MarkdownParserTest, StrongUnderscore) {
auto Nodes = parseMarkdown("__bold__", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *St = cast<StrongNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *St = cast<StrongNode>(P->Children[0]);
ASSERT_EQ(St->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
}
@@ -246,15 +255,19 @@ TEST_F(MarkdownParserTest, StrongUnderscore) {
TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) {
auto Nodes = parseMarkdown("**strong**", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_TRUE(isa<StrongNode>(Nodes[0]));
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_TRUE(isa<StrongNode>(P->Children[0]));
}
TEST_F(MarkdownParserTest, InlineCode) {
auto Nodes = parseMarkdown("call `foo()` here", Arena);
- ASSERT_EQ(Nodes.size(), 3u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call ");
- EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()");
- EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here");
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "call ");
+ EXPECT_EQ(cast<InlineCodeNode>(P->Children[1])->Code, "foo()");
+ EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " here");
}
// CommonMark §6.1: a doubled backtick fence lets the span contain a single
@@ -262,14 +275,18 @@ TEST_F(MarkdownParserTest, InlineCode) {
TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) {
auto Nodes = parseMarkdown("``a`b``", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "a`b");
}
// Emphasis and strong recurse, so a code span inside emphasis is parsed.
TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
auto Nodes = parseMarkdown("*see `x`*", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *Em = cast<EmphasisNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *Em = cast<EmphasisNode>(P->Children[0]);
ASSERT_EQ(Em->Children.size(), 2u);
EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see ");
EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x");
@@ -278,7 +295,9 @@ TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
auto Nodes = parseMarkdown("**a `b`**", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *St = cast<StrongNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *St = cast<StrongNode>(P->Children[0]);
ASSERT_EQ(St->Children.size(), 2u);
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a ");
EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b");
@@ -288,21 +307,27 @@ TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) {
auto Nodes = parseMarkdown("a * b", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a * b");
}
// An unterminated code span leaves the backtick as literal text.
TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) {
auto Nodes = parseMarkdown("a `b c", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a `b c");
}
// Inline parsing must not disturb plain text with no markers.
TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
auto Nodes = parseMarkdown("just words", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "just words");
}
} // namespace
>From 7bb303ad25d10ba9540af4ee38f3aac0582d49df Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:18:11 -0400
Subject: [PATCH 14/27] [clang-doc] Add CharReader cursor for character-level
inline scanning
---
.../clang-doc/support/Markdown.cpp | 75 +++++++++++++++----
1 file changed, 60 insertions(+), 15 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 59e651d2b8b05..1eb6ad51eaf02 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -82,6 +82,49 @@ class LineReader {
size_t Pos = 0;
};
+// A forward cursor over the characters of a string. The character-level analog
+// of LineReader: the inline scanner inspects the current or an upcoming
+// character and consumes characters without manual index arithmetic. position()
+// and seek() let it interoperate with the index-based run and delimiter helpers
+// below, since inline constructs are not consumed one character at a time.
+class CharReader {
+public:
+ explicit CharReader(StringRef S) : S(S) {}
+
+ // True once every character has been consumed.
+ bool atEnd() const { return Pos >= S.size(); }
+
+ // The current character. Must not be called when atEnd().
+ char peek() const {
+ assert(!atEnd() && "peek past end of input");
+ return S[Pos];
+ }
+
+ // The character Offset positions ahead of the cursor, or '\0' when that
+ // position is past the end. peek(0) is the current character.
+ char peek(size_t Offset) const {
+ size_t Target = Pos + Offset;
+ return Target < S.size() ? S[Target] : '\0';
+ }
+
+ // Consume the current character and return it. Must not be called when
+ // atEnd().
+ char advance() {
+ assert(!atEnd() && "advance past end of input");
+ return S[Pos++];
+ }
+
+ // The current scan position, for substring, run, and delimiter computations.
+ size_t position() const { return Pos; }
+
+ // Move the cursor to an absolute position, used to skip past a matched span.
+ void seek(size_t NewPos) { Pos = NewPos; }
+
+private:
+ StringRef S;
+ size_t Pos = 0;
+};
+
// Returns the number of consecutive copies of C starting at S[Start].
static size_t countRun(StringRef S, size_t Start, char C) {
size_t I = Start;
@@ -137,7 +180,8 @@ static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
StringSaver &Saver) {
SmallVector<MDNode *> Nodes;
- size_t TextStart = 0, Pos = 0, E = S.size();
+ CharReader Reader(S);
+ size_t TextStart = 0;
auto flushText = [&](size_t End) {
if (End > TextStart)
@@ -145,42 +189,43 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
Saver.save(S.substr(TextStart, End - TextStart))));
};
- while (Pos < E) {
- char C = S[Pos];
+ while (!Reader.atEnd()) {
+ size_t Pos = Reader.position();
+ char C = Reader.peek();
// Inline code span: an opening backtick run closed by a run of the same
// length.
if (C == '`') {
size_t OpenLen = countRun(S, Pos, '`');
size_t ClosePos = Pos + OpenLen;
- while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen)
+ while (ClosePos < S.size() && countRun(S, ClosePos, '`') != OpenLen)
ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
- if (ClosePos < E) {
+ if (ClosePos < S.size()) {
flushText(Pos);
StringRef Code =
trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
- Pos = ClosePos + OpenLen;
- TextStart = Pos;
+ Reader.seek(ClosePos + OpenLen);
+ TextStart = Reader.position();
continue;
}
// No closing run; leave the backticks as literal text.
- Pos += OpenLen;
+ Reader.seek(Pos + OpenLen);
continue;
}
// Emphasis (*text*, _text_) and strong (**text**, __text__).
if (C == '*' || C == '_') {
// Strong binds the two-delimiter form before single-delimiter emphasis.
- if (Pos + 1 < E && S[Pos + 1] == C) {
+ if (Reader.peek(1) == C) {
size_t Close = findClosingDelim(S, Pos + 2, C, 2);
if (Close != StringRef::npos) {
flushText(Pos);
StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
Nodes.push_back(new (Arena)
StrongNode(parseInline(Inner, Arena, Saver)));
- Pos = Close + 2;
- TextStart = Pos;
+ Reader.seek(Close + 2);
+ TextStart = Reader.position();
continue;
}
}
@@ -190,16 +235,16 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
Nodes.push_back(new (Arena)
EmphasisNode(parseInline(Inner, Arena, Saver)));
- Pos = Close + 1;
- TextStart = Pos;
+ Reader.seek(Close + 1);
+ TextStart = Reader.position();
continue;
}
}
- ++Pos;
+ Reader.advance();
}
- flushText(E);
+ flushText(S.size());
return allocateArray(Nodes, Arena);
}
>From 6864c7b552b37e64ee69c4660517da2cf2c22975 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:22:06 -0400
Subject: [PATCH 15/27] [clang-doc] Extract block parse bodies into separate
functions
---
.../clang-doc/support/Markdown.cpp | 122 +++++++++++-------
1 file changed, 73 insertions(+), 49 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 1eb6ad51eaf02..625b3e6305ab9 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -248,6 +248,75 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
return allocateArray(Nodes, Arena);
}
+// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
+// opening fence; the fence, body lines, and closing fence are consumed.
+//
+// TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+// indented up to 3 spaces, the closing fence must use the same character and be
+// at least as long as the opening fence, and the closing fence may only be
+// followed by spaces. Doxygen specifics should be handled on a case-by-case
+// basis.
+static FencedCodeNode *parseFencedCode(LineReader &Reader,
+ BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ StringRef Open = Reader.peek().trim();
+ char Fence = Open[0];
+ StringRef Lang = Saver.save(Open.drop_front(3).trim());
+ Reader.advance(); // consume opening fence
+ SmallVector<StringRef> CodeLines;
+ while (!Reader.atEnd()) {
+ StringRef CodeLine = Reader.peek().trim();
+ if (CodeLine.size() >= 3 &&
+ all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; }))
+ break;
+ CodeLines.push_back(Saver.save(Reader.advance()));
+ }
+ if (!Reader.atEnd())
+ Reader.advance(); // consume closing fence
+ auto *Code =
+ new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+ LDBG() << "emitting FencedCodeNode lang='" << Lang
+ << "' lines=" << CodeLines.size();
+ return Code;
+}
+
+// Parses a pipe table. The cursor must be on the header row, with a separator
+// row following; consecutive lines containing a | are taken as rows.
+static TableNode *parsePipeTable(LineReader &Reader, BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ SmallVector<StringRef> Rows;
+ // TODO: Rows are kept as raw line text for now. Table cells may contain
+ // inline content (emphasis, code spans, links), so each row may need to be
+ // split on '|' and parsed further into structured cells.
+ while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+ Rows.push_back(Saver.save(Reader.advance().trim()));
+ auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+ LDBG() << "emitting TableNode rows=" << Rows.size();
+ return Table;
+}
+
+// Parses an unordered (bullet) list. The cursor must be on the first item;
+// consecutive bullet lines are consumed into list items.
+static UnorderedListNode *parseUnorderedList(LineReader &Reader,
+ BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ SmallVector<ListItemNode *> Items;
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
+ if (!isListItem(L))
+ break;
+ StringRef ItemText = Saver.save(L.drop_front(2).trim());
+ SmallVector<MDNode *> ItemChildren;
+ ItemChildren.push_back(new (Arena) TextNode(ItemText));
+ auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
+ Items.push_back(Item);
+ Reader.advance();
+ }
+ auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+ LDBG() << "emitting UnorderedListNode items=" << Items.size();
+ return List;
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -268,66 +337,21 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
- // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
- // indented up to 3 spaces, the closing fence must use the same character
- // and be at least as long as the opening fence, and the closing fence may
- // only be followed by spaces. Doxygen specifics should be handled on a
- // case-by-case basis.
+ // Fenced code block.
if (Line.starts_with("```") || Line.starts_with("~~~")) {
- char Fence = Line[0];
- StringRef Lang = Saver.save(Line.drop_front(3).trim());
- Reader.advance(); // consume opening fence
- SmallVector<StringRef> CodeLines;
- while (!Reader.atEnd()) {
- StringRef CodeLine = Reader.peek().trim();
- if (CodeLine.size() >= 3 &&
- all_of(CodeLine.take_front(3),
- [Fence](char C) { return C == Fence; }))
- break;
- CodeLines.push_back(Saver.save(Reader.advance()));
- }
- if (!Reader.atEnd())
- Reader.advance(); // consume closing fence
- auto *Code =
- new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
- LDBG() << "emitting FencedCodeNode lang='" << Lang
- << "' lines=" << CodeLines.size();
- Nodes.push_back(Code);
+ Nodes.push_back(parseFencedCode(Reader, Arena, Saver));
continue;
}
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
- SmallVector<StringRef> Rows;
- // TODO: Rows are kept as raw line text for now. Table cells may contain
- // inline content (emphasis, code spans, links), so each row may need to
- // be split on '|' and parsed further into structured cells.
- while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
- Rows.push_back(Saver.save(Reader.advance().trim()));
- auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
- LDBG() << "emitting TableNode rows=" << Rows.size();
- Nodes.push_back(Table);
+ Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
continue;
}
// Unordered list item.
if (isListItem(Line)) {
- SmallVector<ListItemNode *> Items;
- while (!Reader.atEnd()) {
- StringRef L = Reader.peek().trim();
- if (!isListItem(L))
- break;
- StringRef ItemText = Saver.save(L.drop_front(2).trim());
- SmallVector<MDNode *> ItemChildren;
- ItemChildren.push_back(new (Arena) TextNode(ItemText));
- auto *Item =
- new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
- Items.push_back(Item);
- Reader.advance();
- }
- auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
- LDBG() << "emitting UnorderedListNode items=" << Items.size();
- Nodes.push_back(List);
+ Nodes.push_back(parseUnorderedList(Reader, Arena, Saver));
continue;
}
>From 86e45d603fefc440f3334516b5f4fcfd69354d7a Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:27:51 -0400
Subject: [PATCH 16/27] [clang-doc] Add ATX heading parsing with inline content
support
---
.../clang-doc/support/Markdown.cpp | 34 +++++++++++
.../clang-doc/MarkdownParserTest.cpp | 59 +++++++++++++++++++
2 files changed, 93 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 625b3e6305ab9..d59d95586e836 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -46,6 +46,16 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
+// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
+// six leading # characters followed by a space. Returns 0 otherwise, so seven
+// or more # characters fall back to plain text.
+static unsigned atxHeadingLevel(StringRef Line) {
+ size_t Level = Line.find_first_not_of('#');
+ if (Level == StringRef::npos || Level < 1 || Level > 6 || Line[Level] != ' ')
+ return 0;
+ return Level;
+}
+
// A forward cursor over the lines of a paragraph. Encapsulates the parse
// position so the loop can inspect the current or an upcoming line and consume
// lines without manual index arithmetic. Lines are stored untrimmed; callers
@@ -317,6 +327,24 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
return List;
}
+// Parses an ATX heading: one to six leading # characters and a space, followed
+// by inline content. The cursor must be on the heading line, which is consumed.
+//
+// TODO: CommonMark §4.2 also allows up to 3 leading spaces and an optional
+// closing run of # characters; neither is handled yet.
+static HeadingNode *parseHeading(LineReader &Reader, BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ StringRef Line = Reader.peek().trim();
+ unsigned Level = atxHeadingLevel(Line);
+ assert(Level >= 1 && Level <= 6 && "parseHeading called on a non-heading");
+ StringRef Content = Line.drop_front(Level).trim();
+ Reader.advance();
+ auto *Heading =
+ new (Arena) HeadingNode(Level, parseInline(Content, Arena, Saver));
+ LDBG() << "emitting HeadingNode level=" << Level;
+ return Heading;
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -343,6 +371,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // ATX heading: 1 to 6 leading # characters and a space.
+ if (atxHeadingLevel(Line)) {
+ Nodes.push_back(parseHeading(Reader, Arena, Saver));
+ continue;
+ }
+
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 63d978061b99b..c48b7a463c3a0 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -330,4 +330,63 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "just words");
}
+TEST_F(MarkdownParserTest, Heading1) {
+ auto Nodes = parseMarkdown("# Title", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 1u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, Heading2) {
+ auto Nodes = parseMarkdown("## Title", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 2u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, Heading3) {
+ auto Nodes = parseMarkdown("### Title", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 3u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, HeadingWithInlineCode) {
+ auto Nodes = parseMarkdown("# Use `foo()`", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 1u);
+ ASSERT_EQ(H->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Use ");
+ EXPECT_EQ(cast<InlineCodeNode>(H->Children[1])->Code, "foo()");
+}
+
+TEST_F(MarkdownParserTest, HeadingWithEmphasis) {
+ auto Nodes = parseMarkdown("## see *this*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 2u);
+ ASSERT_EQ(H->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "see ");
+ auto *Em = cast<EmphasisNode>(H->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "this");
+}
+
+// Seven or more # characters are not a valid ATX heading, so the line falls
+// back to a plain-text paragraph.
+TEST_F(MarkdownParserTest, SevenHashesIsPlainText) {
+ auto Nodes = parseMarkdown("####### too many", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "####### too many");
+}
+
} // namespace
>From 2b14505cadb016131f53cbc3200973c3cee6ae04 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:32:14 -0400
Subject: [PATCH 17/27] [clang-doc] Run list item text through parseInline for
inline markup support
---
.../clang-doc/support/Markdown.cpp | 6 ++---
.../clang-doc/MarkdownParserTest.cpp | 24 ++++++++++++++++---
2 files changed, 23 insertions(+), 7 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index d59d95586e836..6901f6c2f40a5 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -315,10 +315,8 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
StringRef L = Reader.peek().trim();
if (!isListItem(L))
break;
- StringRef ItemText = Saver.save(L.drop_front(2).trim());
- SmallVector<MDNode *> ItemChildren;
- ItemChildren.push_back(new (Arena) TextNode(ItemText));
- auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
+ StringRef ItemText = L.drop_front(2).trim();
+ auto *Item = new (Arena) ListItemNode(parseInline(ItemText, Arena, Saver));
Items.push_back(Item);
Reader.advance();
}
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index c48b7a463c3a0..9a7d6d1fd0942 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -98,9 +98,27 @@ TEST_F(MarkdownParserTest, UnorderedList) {
ASSERT_EQ(Nodes.size(), 1u);
auto *N = cast<UnorderedListNode>(Nodes[0]);
ASSERT_EQ(N->Items.size(), 3u);
- EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
- EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
- EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
+ // Each item's children are the inline nodes from parseInline.
+ StringRef ExpectedText[] = {"foo", "bar", "baz"};
+ for (size_t I = 0; I < N->Items.size(); ++I) {
+ auto *Item = N->Items[I];
+ ASSERT_EQ(Item->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, ExpectedText[I]);
+ }
+}
+
+TEST_F(MarkdownParserTest, ListItemWithEmphasis) {
+ auto Nodes = parseMarkdown("- an *important* note", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 1u);
+ auto *Item = N->Items[0];
+ ASSERT_EQ(Item->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(Item->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
}
TEST_F(MarkdownParserTest, MixedContent) {
>From aaf4b6e2b1600bce25f625abb2caf9ad25b52f90 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:42:27 -0400
Subject: [PATCH 18/27] [clang-doc] Add ordered list parsing with inline
content support
---
.../clang-doc/support/Markdown.cpp | 40 +++++++++++++++++
.../clang-doc/MarkdownParserTest.cpp | 44 +++++++++++++++++++
2 files changed, 84 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 6901f6c2f40a5..211fb0407578f 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -46,6 +46,14 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
+// Returns true if Line begins with an ordered list marker: one or more digits
+// followed by a period and a space (e.g. "1. ", "42. ").
+static bool isOrderedListItem(StringRef Line) {
+ size_t Dot = Line.find_first_not_of("0123456789");
+ return Dot != StringRef::npos && Dot > 0 && Line[Dot] == '.' &&
+ Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
+}
+
// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
// six leading # characters followed by a space. Returns 0 otherwise, so seven
// or more # characters fall back to plain text.
@@ -325,6 +333,32 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
return List;
}
+// Parses an ordered (numbered) list. The cursor must be on the first item; the
+// start number is taken from that item's marker and consecutive numbered lines
+// are consumed. Item numbers after the first are not validated.
+static OrderedListNode *parseOrderedList(LineReader &Reader,
+ BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ unsigned Start = 0;
+ Reader.peek().trim().take_while(isDigit).getAsInteger(10, Start);
+ SmallVector<ListItemNode *> Items;
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
+ if (!isOrderedListItem(L))
+ break;
+ // Drop the "<digits>. " marker: the digits, the period, and the space.
+ StringRef ItemText =
+ L.drop_front(L.find_first_not_of("0123456789") + 2).trim();
+ auto *Item = new (Arena) ListItemNode(parseInline(ItemText, Arena, Saver));
+ Items.push_back(Item);
+ Reader.advance();
+ }
+ auto *List = new (Arena) OrderedListNode(Start, allocateArray(Items, Arena));
+ LDBG() << "emitting OrderedListNode start=" << Start
+ << " items=" << Items.size();
+ return List;
+}
+
// Parses an ATX heading: one to six leading # characters and a space, followed
// by inline content. The cursor must be on the heading line, which is consumed.
//
@@ -387,6 +421,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // Ordered list item: digits followed by a period and a space.
+ if (isOrderedListItem(Line)) {
+ Nodes.push_back(parseOrderedList(Reader, Arena, Saver));
+ continue;
+ }
+
// Plain text line: scan for inline constructs (emphasis, strong, code) and
// wrap the result in a paragraph.
auto Inlines = parseInline(Line, Arena, Saver);
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 9a7d6d1fd0942..a0ba39c163a34 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -121,6 +121,50 @@ TEST_F(MarkdownParserTest, ListItemWithEmphasis) {
EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
}
+TEST_F(MarkdownParserTest, OrderedList) {
+ auto Nodes = parseMarkdown(R"(1. foo
+2. bar
+3. baz)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 1u);
+ ASSERT_EQ(N->Items.size(), 3u);
+ StringRef ExpectedText[] = {"foo", "bar", "baz"};
+ for (size_t I = 0; I < N->Items.size(); ++I) {
+ auto *Item = N->Items[I];
+ ASSERT_EQ(Item->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, ExpectedText[I]);
+ }
+}
+
+TEST_F(MarkdownParserTest, OrderedListCustomStart) {
+ auto Nodes = parseMarkdown(R"(5. five
+6. six)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 5u);
+ ASSERT_EQ(N->Items.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "five");
+ EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "six");
+}
+
+TEST_F(MarkdownParserTest, OrderedListItemWithEmphasis) {
+ auto Nodes = parseMarkdown("1. an *important* note", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 1u);
+ ASSERT_EQ(N->Items.size(), 1u);
+ auto *Item = N->Items[0];
+ ASSERT_EQ(Item->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(Item->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
+}
+
TEST_F(MarkdownParserTest, MixedContent) {
auto Nodes = parseMarkdown(R"(some text
```````
>From 2ce9a89495e81eb5f0c67551f114e08eadefdabd Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:47:42 -0400
Subject: [PATCH 19/27] [clang-doc] Add thematic break parsing
---
.../clang-doc/support/Markdown.cpp | 26 +++++++++++++++++++
.../clang-doc/MarkdownParserTest.cpp | 18 +++++++++++++
2 files changed, 44 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 211fb0407578f..2f0cc5bffe566 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -54,6 +54,23 @@ static bool isOrderedListItem(StringRef Line) {
Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
}
+// Returns true if Line is a thematic break: three or more matching -, *, or _
+// characters, optionally separated by spaces, with nothing else. Line is
+// expected to be trimmed.
+static bool isThematicBreak(StringRef Line) {
+ char Marker = Line.empty() ? '\0' : Line[0];
+ if (Marker != '-' && Marker != '*' && Marker != '_')
+ return false;
+ unsigned Count = 0;
+ for (char C : Line) {
+ if (C == Marker)
+ ++Count;
+ else if (C != ' ')
+ return false;
+ }
+ return Count >= 3;
+}
+
// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
// six leading # characters followed by a space. Returns 0 otherwise, so seven
// or more # characters fall back to plain text.
@@ -409,6 +426,15 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // Thematic break: 3 or more matching -, *, or _ characters. Checked before
+ // the list cases so that "* * *" and "- - -" are breaks, not list items.
+ if (isThematicBreak(Line)) {
+ Reader.advance();
+ Nodes.push_back(new (Arena) ThematicBreakNode());
+ LDBG() << "emitting ThematicBreakNode";
+ continue;
+ }
+
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index a0ba39c163a34..188d1987ac06d 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -451,4 +451,22 @@ TEST_F(MarkdownParserTest, SevenHashesIsPlainText) {
EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "####### too many");
}
+TEST_F(MarkdownParserTest, ThematicBreakDashes) {
+ auto Nodes = parseMarkdown("---", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, ThematicBreakAsterisks) {
+ auto Nodes = parseMarkdown("***", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, ThematicBreakUnderscores) {
+ auto Nodes = parseMarkdown("___", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
} // namespace
>From 843d0554dd1ad93c139d69f09fdc06800df7b078 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:58:15 -0400
Subject: [PATCH 20/27] [clang-doc] Add CommonMark spec edge case tests with
section citations
---
.../clang-doc/MarkdownParserTest.cpp | 145 ++++++++++++++++++
1 file changed, 145 insertions(+)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 188d1987ac06d..350b15c2541ed 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -469,4 +469,149 @@ TEST_F(MarkdownParserTest, ThematicBreakUnderscores) {
EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
}
+//===----------------------------------------------------------------------===//
+// CommonMark spec edge cases (spec.commonmark.org/0.31.2). Each test cites the
+// section and example it exercises. Cases marked DIVERGENCE document where this
+// simplified parser intentionally differs from full CommonMark.
+//===----------------------------------------------------------------------===//
+
+// CommonMark §4.1 Example 51: spaces are allowed between the characters.
+TEST_F(MarkdownParserTest, ThematicBreakSpacedDashes) {
+ auto Nodes = parseMarkdown("- - -", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+// CommonMark §4.1 Example 44: +++ is not a thematic break.
+TEST_F(MarkdownParserTest, PlusesAreNotThematicBreak) {
+ auto Nodes = parseMarkdown("+++", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "+++");
+}
+
+// CommonMark §4.1 Example 46: fewer than three characters is not a break.
+TEST_F(MarkdownParserTest, TwoDashesAreNotThematicBreak) {
+ auto Nodes = parseMarkdown("--", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "--");
+}
+
+// CommonMark §4.2 Example 64: a # not followed by a space is not a heading.
+TEST_F(MarkdownParserTest, HashWithoutSpaceIsNotHeading) {
+ auto Nodes = parseMarkdown("#5 bolt", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "#5 bolt");
+}
+
+// CommonMark §4.2 Example 64: "#hashtag" is a paragraph, not a heading.
+TEST_F(MarkdownParserTest, HashtagIsNotHeading) {
+ auto Nodes = parseMarkdown("#hashtag", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "#hashtag");
+}
+
+// CommonMark §4.2 Example 67: spaces around the heading content are stripped.
+TEST_F(MarkdownParserTest, HeadingStripsContentSpaces) {
+ auto Nodes = parseMarkdown("# foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 1u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2: * is a valid bullet list marker.
+TEST_F(MarkdownParserTest, UnorderedListAsteriskMarker) {
+ auto Nodes = parseMarkdown("* foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2 Example 301: + is a valid bullet list marker.
+TEST_F(MarkdownParserTest, UnorderedListPlusMarker) {
+ auto Nodes = parseMarkdown("+ foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2 Example 267: an ordered list may start at 0.
+TEST_F(MarkdownParserTest, OrderedListStartZero) {
+ auto Nodes = parseMarkdown("0. ok", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 0u);
+ ASSERT_EQ(N->Items.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "ok");
+}
+
+// CommonMark §5.2 Example 296: ordered lists may use a ) delimiter. DIVERGENCE:
+// this parser only recognizes the . delimiter, so "1) foo" is plain text.
+TEST_F(MarkdownParserTest, OrderedListParenDelimiterNotSupported) {
+ auto Nodes = parseMarkdown("1) foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "1) foo");
+}
+
+// CommonMark §6.2 Example 355: intraword emphasis with asterisks.
+TEST_F(MarkdownParserTest, IntrawordEmphasisAsterisk) {
+ auto Nodes = parseMarkdown("foo*bar*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.2 Example 381: intraword strong with asterisks.
+TEST_F(MarkdownParserTest, IntrawordStrongAsterisk) {
+ auto Nodes = parseMarkdown("foo**bar**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+ auto *St = cast<StrongNode>(P->Children[1]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.2 Example 360: intraword underscores do NOT open emphasis, so
+// "foo_bar_" is literal text. DIVERGENCE: this parser lacks the intraword
+// underscore rule (see the findClosingDelim TODO) and treats it as emphasis.
+TEST_F(MarkdownParserTest, IntrawordUnderscoreEmphasisDivergence) {
+ auto Nodes = parseMarkdown("foo_bar_", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.1 Example 331: a code span strips one leading and trailing
+// space when both are present.
+TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
+ auto Nodes = parseMarkdown("`` x ``", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
+}
+
} // namespace
>From e9e6b8d7b1509d36d5c93f604f31b3e4ad9a63ea Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 00:03:54 -0400
Subject: [PATCH 21/27] [clang-doc] Add block quote parsing with recursive
inner parsing
---
.../clang-doc/support/Markdown.cpp | 38 ++++++++++++++
.../clang-doc/MarkdownParserTest.cpp | 50 +++++++++++++++++++
2 files changed, 88 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 2f0cc5bffe566..fdfc619e0ea05 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -14,6 +14,7 @@
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>
+#include <string>
#define DEBUG_TYPE "clang-doc"
@@ -71,6 +72,12 @@ static bool isThematicBreak(StringRef Line) {
return Count >= 3;
}
+// Returns true if Line is a block quote line: it starts with "> ", or is a bare
+// ">" marking an empty quote line.
+static bool isBlockQuote(StringRef Line) {
+ return Line.starts_with("> ") || Line == ">";
+}
+
// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
// six leading # characters followed by a space. Returns 0 otherwise, so seven
// or more # characters fall back to plain text.
@@ -394,6 +401,31 @@ static HeadingNode *parseHeading(LineReader &Reader, BumpPtrAllocator &Arena,
return Heading;
}
+// Parses a block quote: one or more consecutive lines beginning with "> ". The
+// > marker and one following space are stripped from each line, and the
+// collected text is parsed recursively, so a quote's children are block-level
+// nodes and nested quotes fall out naturally.
+static BlockQuoteNode *parseBlockQuote(LineReader &Reader,
+ BumpPtrAllocator &Arena) {
+ std::string Inner;
+ bool First = true;
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
+ if (!isBlockQuote(L))
+ break;
+ if (!First)
+ Inner += '\n';
+ First = false;
+ StringRef Content = L.starts_with("> ") ? L.drop_front(2) : L.drop_front(1);
+ Inner.append(Content.data(), Content.size());
+ Reader.advance();
+ }
+ ArrayRef<MDNode *> Children = parseMarkdown(Inner, Arena);
+ auto *Quote = new (Arena) BlockQuoteNode(Children);
+ LDBG() << "emitting BlockQuoteNode children=" << Children.size();
+ return Quote;
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -435,6 +467,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // Block quote: consecutive lines beginning with "> ".
+ if (isBlockQuote(Line)) {
+ Nodes.push_back(parseBlockQuote(Reader, Arena));
+ continue;
+ }
+
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 350b15c2541ed..aedcd9407b197 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -614,4 +614,54 @@ TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
}
+TEST_F(MarkdownParserTest, BlockQuote) {
+ auto Nodes = parseMarkdown("> hello", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Q->Children.size(), 1u);
+ auto *P = cast<ParagraphNode>(Q->Children[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "hello");
+}
+
+TEST_F(MarkdownParserTest, BlockQuoteWithFencedCode) {
+ auto Nodes = parseMarkdown(R"(> ```cpp
+> int x = 0;
+> ```)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Q->Children.size(), 1u);
+ auto *Code = cast<FencedCodeNode>(Q->Children[0]);
+ EXPECT_EQ(Code->Lang, "cpp");
+ ASSERT_EQ(Code->Lines.size(), 1u);
+ EXPECT_EQ(Code->Lines[0], "int x = 0;");
+}
+
+TEST_F(MarkdownParserTest, BlockQuoteWithEmphasis) {
+ auto Nodes = parseMarkdown("> an *important* note", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Q->Children.size(), 1u);
+ auto *P = cast<ParagraphNode>(Q->Children[0]);
+ ASSERT_EQ(P->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " note");
+}
+
+TEST_F(MarkdownParserTest, NestedBlockQuote) {
+ auto Nodes = parseMarkdown("> > deep", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Outer = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Outer->Children.size(), 1u);
+ auto *Inner = cast<BlockQuoteNode>(Outer->Children[0]);
+ ASSERT_EQ(Inner->Children.size(), 1u);
+ auto *P = cast<ParagraphNode>(Inner->Children[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "deep");
+}
+
} // namespace
>From 9325916d90af28470b9c7dd634b3c12471cc6dc5 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 12:22:13 -0400
Subject: [PATCH 22/27] [clang-doc] Address review feedback: review comments
---
.../clang-doc/support/Markdown.cpp | 2 +-
.../clang-doc/support/Markdown.h | 52 +++++++++----------
2 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index fdfc619e0ea05..08277b1405e0b 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -293,7 +293,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
// opening fence; the fence, body lines, and closing fence are consumed.
//
-// TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+// TODO: Follow CommonMark spec §4.5 more closely. Opening fences may be
// indented up to 3 spaces, the closing fence must use the same character and be
// at least as long as the opening fence, and the closing fence may only be
// followed by spaces. Doxygen specifics should be handled on a case-by-case
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 8c2055868671a..a9b00a5c10225 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -15,27 +15,27 @@
/// llvm::isa<>/cast<>/dyn_cast<> to check what a node is.
///
/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
-/// TextNode -- plain text run
-/// SoftBreakNode -- soft line break
-/// HardBreakNode -- hard line break (trailing spaces or backslash)
-/// InlineCodeNode -- inline code span (`code`)
-/// EmphasisNode -- emphasis (*text* or _text_)
-/// StrongNode -- strong emphasis (**text** or __text__)
+/// TextNode: plain text run
+/// SoftBreakNode: soft line break
+/// HardBreakNode: hard line break (trailing spaces or backslash)
+/// InlineCodeNode: inline code span (`code`)
+/// EmphasisNode: emphasis (*text* or _text_)
+/// StrongNode: strong emphasis (**text** or __text__)
///
/// Block nodes:
-/// ParagraphNode -- sequence of inline nodes
-/// HeadingNode -- ATX heading (# through ######), level 1-6
-/// FencedCodeNode -- fenced code block (``` or ~~~)
-/// TableNode -- pipe table (raw row text; TODO: structured cells)
-/// UnorderedListNode -- bullet list (-, *, +)
-/// OrderedListNode -- numbered list with explicit start number
-/// ListItemNode -- single item inside a list
-/// BlockQuoteNode -- block quote (>)
-/// ThematicBreakNode -- horizontal rule (---, ***, ___)
+/// ParagraphNode: sequence of inline nodes
+/// HeadingNode: ATX heading (# through ######), level 1-6
+/// FencedCodeNode: fenced code block (``` or ~~~)
+/// TableNode: pipe table (raw row text; TODO: structured cells)
+/// UnorderedListNode: bullet list (-, *, +)
+/// OrderedListNode: numbered list with explicit start number
+/// ListItemNode: single item inside a list
+/// BlockQuoteNode: block quote (>)
+/// ThematicBreakNode: horizontal rule (---, ***, ___)
///
/// All nodes are arena-allocated. The caller owns the arena and must keep it
-/// alive for the lifetime of any returned nodes. The parser never crashes on
-/// malformed input; unrecognized text falls back to TextNode.
+/// alive for the lifetime of any returned nodes. Malformed input is parsed as
+/// plain text rather than rejected; unrecognized text falls back to TextNode.
///
//===----------------------------------------------------------------------===//
@@ -58,7 +58,7 @@ enum class NodeKind {
NK_InlineCode,
NK_Emphasis,
NK_Strong,
- NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+ NK_LastInline = NK_Strong, // sentinel: all inline kinds are <= this
// Block nodes
NK_Paragraph,
@@ -70,12 +70,12 @@ enum class NodeKind {
NK_ListItem,
NK_BlockQuote,
NK_ThematicBreak,
- NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
+ NK_FirstBlock = NK_Paragraph, // sentinel: all block kinds are >= this
};
-/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
-/// Nodes are arena-allocated and have no virtual destructor; use
-/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
+/// Base type for all Markdown AST nodes. Nodes are arena-allocated and have no
+/// virtual destructor; use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting.
struct MDNode {
NodeKind Kind;
explicit MDNode(NodeKind K) : Kind(K) {}
@@ -93,7 +93,7 @@ struct TextNode : MDNode {
static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
};
-/// Soft line break -- a newline that does not end the paragraph.
+/// Soft line break: a newline that does not end the paragraph.
struct SoftBreakNode : MDNode {
SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
static bool classof(const MDNode *N) {
@@ -101,7 +101,7 @@ struct SoftBreakNode : MDNode {
}
};
-/// Hard line break -- two trailing spaces or a backslash before a newline.
+/// Hard line break: two trailing spaces or a backslash before a newline.
struct HardBreakNode : MDNode {
HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
static bool classof(const MDNode *N) {
@@ -143,7 +143,7 @@ struct StrongNode : MDNode {
// Block nodes
//===----------------------------------------------------------------------===//
-/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// A paragraph: sequence of inline nodes separated from other blocks by
/// blank lines.
struct ParagraphNode : MDNode {
llvm::ArrayRef<MDNode *> Children;
@@ -169,7 +169,7 @@ struct HeadingNode : MDNode {
/// "cpp"); empty when no language was specified. Lines contains the raw text
/// of each interior line, without the opening or closing fence.
///
-/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// TODO: Follow CommonMark spec §4.5. The opening fence may be indented up
/// to 3 spaces; the closing fence must use the same character and be at least
/// as long as the opening fence; only spaces may follow the closing fence.
struct FencedCodeNode : MDNode {
>From 9061cd48f9ec27d72f252414c047626bc1add513 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 12:42:12 -0400
Subject: [PATCH 23/27] [clang-doc] Implement CommonMark delimiter stack for
emphasis and strong parsing
---
.../clang-doc/support/Markdown.cpp | 256 ++++++++++++++----
.../clang-doc/MarkdownParserTest.cpp | 14 +-
2 files changed, 202 insertions(+), 68 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 08277b1405e0b..9ce5339fc8cb6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>
@@ -184,50 +185,108 @@ static StringRef trimCodeSpan(StringRef Code) {
return Code;
}
-// Finds the start index of a closing emphasis run of exactly DelimLen copies of
-// DelimChar, searching forward from StartPos. Requires non-whitespace
-// immediately inside both the opening and closing delimiters and non-empty
-// content, a simplified take on the CommonMark §6.2 flanking rules. Returns
-// StringRef::npos if no valid closing run exists.
-static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
- size_t DelimLen) {
- size_t E = S.size();
- // Opening delimiter is not left-flanking if whitespace follows it.
- if (StartPos >= E || isSpace(S[StartPos]))
- return StringRef::npos;
- for (size_t J = StartPos; J + DelimLen <= E; ++J) {
- if (S[J] != DelimChar)
- continue;
- size_t Run = countRun(S, J, DelimChar);
- if (Run != DelimLen) {
- J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
- continue;
- }
- // Reject empty content and closing runs that are not right-flanking.
- if (J == StartPos || isSpace(S[J - 1]))
- continue;
- return J;
+// Treats the start and end of the string (passed as '\0') as whitespace for the
+// CommonMark flanking rules.
+static bool isFlankWhitespace(char C) { return C == '\0' || isSpace(C); }
+
+// Computes whether a delimiter run can open or close emphasis, from the
+// characters immediately before and after the run, per the CommonMark §6.2
+// flanking rules. Before and After are '\0' at the string boundaries.
+static void computeFlanking(char Before, char Marker, char After, bool &CanOpen,
+ bool &CanClose) {
+ bool AfterWS = isFlankWhitespace(After);
+ bool BeforeWS = isFlankWhitespace(Before);
+ bool AfterPunct = isPunct(After);
+ bool BeforePunct = isPunct(Before);
+ bool LeftFlanking = !AfterWS && (!AfterPunct || BeforeWS || BeforePunct);
+ bool RightFlanking = !BeforeWS && (!BeforePunct || AfterWS || AfterPunct);
+ if (Marker == '_') {
+ // Underscore does not open or close emphasis intraword.
+ CanOpen = LeftFlanking && (!RightFlanking || BeforePunct);
+ CanClose = RightFlanking && (!LeftFlanking || AfterPunct);
+ } else {
+ CanOpen = LeftFlanking;
+ CanClose = RightFlanking;
}
- return StringRef::npos;
}
+namespace {
+// One piece of inline content while emphasis is being resolved. A piece is
+// either a finished content node (text, code span, or a built emphasis or
+// strong node) or a run of delimiter characters that may still open or close
+// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs
+// can be spliced out without shifting the others.
+struct InlinePiece {
+ MDNode *Node = nullptr; // content node, or null while this is a delimiter run
+ char Ch = 0; // '*' or '_' for a delimiter run
+ size_t Len = 0; // delimiters still available in the run
+ unsigned OrigLen = 0; // original run length, for the multiple-of-three rule
+ bool CanOpen = false;
+ bool CanClose = false;
+ int Prev = -1;
+ int Next = -1;
+};
+} // namespace
+
// Parses the inline content of a single line into a sequence of inline nodes:
-// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
-// _text_). Runs that match no construct become TextNodes. Emphasis and strong
-// recurse so their content may itself contain inline constructs. Text with no
-// markers yields a single TextNode.
+// inline code (`code`), emphasis (*text* or _text_), and strong (**text** or
+// __text__). Emphasis is resolved with a CommonMark-style delimiter stack: a
+// first pass tokenizes the line into text, code spans, and delimiter runs (each
+// tagged with its flanking flags), then a second pass walks closers back to
+// openers, honoring the multiple-of-three rule. Unmatched runs stay as text.
//
-// TODO: This covers the common cases but not the full CommonMark §6 inline
-// model (delimiter stacks, intraword underscore rules, links, autolinks).
+// TODO: This does not yet handle links, autolinks, or backslash escapes.
static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
StringSaver &Saver) {
- SmallVector<MDNode *> Nodes;
+ SmallVector<InlinePiece> Pool;
+ int Head = -1, Tail = -1;
+
+ auto makePiece = [&]() -> int {
+ Pool.emplace_back();
+ return Pool.size() - 1;
+ };
+ auto linkAtTail = [&](int Idx) {
+ Pool[Idx].Prev = Tail;
+ (Tail != -1 ? Pool[Tail].Next : Head) = Idx;
+ Tail = Idx;
+ };
+ auto appendNode = [&](MDNode *N) {
+ int Idx = makePiece();
+ Pool[Idx].Node = N;
+ linkAtTail(Idx);
+ };
+ // Content nodes pass through; a leftover delimiter run becomes a TextNode of
+ // its remaining characters.
+ auto pieceNode = [&](int P) -> MDNode * {
+ if (Pool[P].Node)
+ return Pool[P].Node;
+ return new (Arena)
+ TextNode(Saver.save(std::string(Pool[P].Len, Pool[P].Ch)));
+ };
+ // Merges adjacent TextNodes so unmatched delimiters coalesce with neighboring
+ // text, then copies the result into the arena.
+ auto finalize = [&](SmallVectorImpl<MDNode *> &Nodes) -> ArrayRef<MDNode *> {
+ SmallVector<MDNode *> Merged;
+ for (MDNode *Nd : Nodes) {
+ if (isa<TextNode>(Nd) && !Merged.empty() &&
+ isa<TextNode>(Merged.back())) {
+ StringRef Prev = cast<TextNode>(Merged.back())->Text;
+ StringRef Cur = cast<TextNode>(Nd)->Text;
+ Merged.back() =
+ new (Arena) TextNode(Saver.save(Prev.str() + Cur.str()));
+ } else {
+ Merged.push_back(Nd);
+ }
+ }
+ return allocateArray(Merged, Arena);
+ };
+
+ // Phase 1: tokenize the line into text, code spans, and delimiter runs.
CharReader Reader(S);
size_t TextStart = 0;
-
auto flushText = [&](size_t End) {
if (End > TextStart)
- Nodes.push_back(new (Arena) TextNode(
+ appendNode(new (Arena) TextNode(
Saver.save(S.substr(TextStart, End - TextStart))));
};
@@ -246,7 +305,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
flushText(Pos);
StringRef Code =
trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
- Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
+ appendNode(new (Arena) InlineCodeNode(Saver.save(Code)));
Reader.seek(ClosePos + OpenLen);
TextStart = Reader.position();
continue;
@@ -256,38 +315,117 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
continue;
}
- // Emphasis (*text*, _text_) and strong (**text**, __text__).
+ // Delimiter run for emphasis or strong.
if (C == '*' || C == '_') {
- // Strong binds the two-delimiter form before single-delimiter emphasis.
- if (Reader.peek(1) == C) {
- size_t Close = findClosingDelim(S, Pos + 2, C, 2);
- if (Close != StringRef::npos) {
- flushText(Pos);
- StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
- Nodes.push_back(new (Arena)
- StrongNode(parseInline(Inner, Arena, Saver)));
- Reader.seek(Close + 2);
- TextStart = Reader.position();
- continue;
+ size_t RunLen = countRun(S, Pos, C);
+ flushText(Pos);
+ char Before = Pos == 0 ? '\0' : S[Pos - 1];
+ char After = Pos + RunLen < S.size() ? S[Pos + RunLen] : '\0';
+ int Idx = makePiece();
+ InlinePiece &D = Pool[Idx];
+ D.Ch = C;
+ D.Len = RunLen;
+ D.OrigLen = RunLen;
+ computeFlanking(Before, C, After, D.CanOpen, D.CanClose);
+ linkAtTail(Idx);
+ Reader.seek(Pos + RunLen);
+ TextStart = Reader.position();
+ continue;
+ }
+
+ Reader.advance();
+ }
+ flushText(S.size());
+
+ // Phase 2: match closers back to openers. OpenersBottom records, per closer
+ // kind, how far back a failed search needs to look, keyed by delimiter char,
+ // run length mod 3, and whether the closer can also open.
+ int OpenersBottom[12];
+ for (int &B : OpenersBottom)
+ B = -1;
+ auto bucket = [](const InlinePiece &P) {
+ return (P.Ch == '_' ? 6 : 0) + (P.OrigLen % 3) * 2 + (P.CanOpen ? 1 : 0);
+ };
+
+ int Current = Head;
+ while (Current != -1) {
+ // Advance to the next run that can close.
+ while (Current != -1 &&
+ !(Pool[Current].Ch && Pool[Current].CanClose && Pool[Current].Len))
+ Current = Pool[Current].Next;
+ if (Current == -1)
+ break;
+ int Closer = Current;
+ int Key = bucket(Pool[Closer]);
+
+ // Search back for the nearest matching opener.
+ int Opener = Pool[Closer].Prev;
+ bool Found = false;
+ while (Opener != -1 && Opener != OpenersBottom[Key]) {
+ InlinePiece &O = Pool[Opener];
+ if (O.Ch == Pool[Closer].Ch && O.Len && O.CanOpen) {
+ unsigned Sum = O.OrigLen + Pool[Closer].OrigLen;
+ bool OddMatch = (O.CanClose || Pool[Closer].CanOpen) && Sum % 3 == 0 &&
+ !(O.OrigLen % 3 == 0 && Pool[Closer].OrigLen % 3 == 0);
+ if (!OddMatch) {
+ Found = true;
+ break;
}
}
- size_t Close = findClosingDelim(S, Pos + 1, C, 1);
- if (Close != StringRef::npos) {
- flushText(Pos);
- StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
- Nodes.push_back(new (Arena)
- EmphasisNode(parseInline(Inner, Arena, Saver)));
- Reader.seek(Close + 1);
- TextStart = Reader.position();
- continue;
- }
+ Opener = Pool[Opener].Prev;
}
- Reader.advance();
+ if (!Found) {
+ OpenersBottom[Key] = Pool[Closer].Prev;
+ // A run that cannot also open will never match anything; keep its text
+ // but stop treating it as a delimiter.
+ if (!Pool[Closer].CanOpen)
+ Pool[Closer].CanClose = false;
+ Current = Pool[Closer].Next;
+ continue;
+ }
+
+ // Wrap the pieces between opener and closer, consuming one delimiter from
+ // each side for emphasis or two for strong.
+ unsigned Use = Pool[Opener].Len >= 2 && Pool[Closer].Len >= 2 ? 2 : 1;
+ SmallVector<MDNode *> Inner;
+ for (int P = Pool[Opener].Next; P != Closer; P = Pool[P].Next)
+ Inner.push_back(pieceNode(P));
+ Pool[Opener].Len -= Use;
+ Pool[Closer].Len -= Use;
+ MDNode *Emph =
+ Use == 2
+ ? static_cast<MDNode *>(new (Arena) StrongNode(finalize(Inner)))
+ : static_cast<MDNode *>(new (Arena) EmphasisNode(finalize(Inner)));
+ int EP = makePiece();
+ Pool[EP].Node = Emph;
+ Pool[EP].Prev = Opener;
+ Pool[EP].Next = Closer;
+ Pool[Opener].Next = EP;
+ Pool[Closer].Prev = EP;
+
+ // Drop the opener or closer once its run is fully consumed.
+ if (Pool[Opener].Len == 0) {
+ int Pr = Pool[Opener].Prev;
+ Pool[EP].Prev = Pr;
+ (Pr != -1 ? Pool[Pr].Next : Head) = EP;
+ }
+ if (Pool[Closer].Len == 0) {
+ int Nx = Pool[Closer].Next;
+ Pool[EP].Next = Nx;
+ (Nx != -1 ? Pool[Nx].Prev : Tail) = EP;
+ Current = Nx;
+ } else {
+ Current = Closer;
+ }
}
- flushText(S.size());
- return allocateArray(Nodes, Arena);
+ // Phase 3: collect the surviving pieces, dropping fully consumed delimiters.
+ SmallVector<MDNode *> Result;
+ for (int P = Head; P != -1; P = Pool[P].Next)
+ if (Pool[P].Node || Pool[P].Len)
+ Result.push_back(pieceNode(P));
+ return finalize(Result);
}
// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index aedcd9407b197..49e61e8c129fa 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -590,18 +590,14 @@ TEST_F(MarkdownParserTest, IntrawordStrongAsterisk) {
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
}
-// CommonMark §6.2 Example 360: intraword underscores do NOT open emphasis, so
-// "foo_bar_" is literal text. DIVERGENCE: this parser lacks the intraword
-// underscore rule (see the findClosingDelim TODO) and treats it as emphasis.
-TEST_F(MarkdownParserTest, IntrawordUnderscoreEmphasisDivergence) {
+// CommonMark §6.2 Example 360: intraword underscores do not open or close
+// emphasis, so "foo_bar_" stays as literal text.
+TEST_F(MarkdownParserTest, IntrawordUnderscoreIsText) {
auto Nodes = parseMarkdown("foo_bar_", Arena);
ASSERT_EQ(Nodes.size(), 1u);
auto *P = cast<ParagraphNode>(Nodes[0]);
- ASSERT_EQ(P->Children.size(), 2u);
- EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
- auto *Em = cast<EmphasisNode>(P->Children[1]);
- ASSERT_EQ(Em->Children.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo_bar_");
}
// CommonMark §6.1 Example 331: a code span strips one leading and trailing
>From 3e6c805a5450fb844e4c59dd6c2f0bca0fdef3eb Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 13:05:41 -0400
Subject: [PATCH 24/27] [clang-doc] Add delimiter stack edge case tests for
nested emphasis and rule-of-three
---
.../clang-doc/MarkdownParserTest.cpp | 65 +++++++++++++++++++
1 file changed, 65 insertions(+)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 49e61e8c129fa..5571478578d93 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -610,6 +610,71 @@ TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
}
+// CommonMark §6.2 Example 413: a triple run splits across two matches, the
+// inner pair forming strong and the outer pair emphasis, so "***foo***" is
+// emphasis wrapping strong. The old findClosingDelim search matched a run only
+// against an equal-length run and could not split one this way.
+TEST_F(MarkdownParserTest, TripleDelimiterBoldItalic) {
+ auto Nodes = parseMarkdown("***foo***", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *Em = cast<EmphasisNode>(P->Children[0]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ auto *St = cast<StrongNode>(Em->Children[0]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "foo");
+}
+
+// CommonMark §6.2: emphasis containing a strong span, "*foo **bar** baz*". The
+// outer emphasis spans delimiter runs of two different lengths, which the
+// equal-length findClosingDelim search could not pair.
+TEST_F(MarkdownParserTest, MixedDelimitersEmStrongEm) {
+ auto Nodes = parseMarkdown("*foo **bar** baz*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *Em = cast<EmphasisNode>(P->Children[0]);
+ ASSERT_EQ(Em->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "foo ");
+ auto *St = cast<StrongNode>(Em->Children[1]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
+ EXPECT_EQ(cast<TextNode>(Em->Children[2])->Text, " baz");
+}
+
+// CommonMark §6.2: strong containing emphasis with text on both sides,
+// "**foo *bar* baz**". The inner emphasis closes before the outer strong does,
+// which the single forward scan handled only by coincidence of nesting order.
+TEST_F(MarkdownParserTest, NestedEmphasisInsideStrong) {
+ auto Nodes = parseMarkdown("**foo *bar* baz**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *St = cast<StrongNode>(P->Children[0]);
+ ASSERT_EQ(St->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "foo ");
+ auto *Em = cast<EmphasisNode>(St->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+ EXPECT_EQ(cast<TextNode>(St->Children[2])->Text, " baz");
+}
+
+// CommonMark §6.2 rule of three: when a closer can also open, it may not match
+// an opener whose run length sums with the closer's to a multiple of three. In
+// "**foo*bar*" the leading ** cannot close against the inner *, so ** stays
+// literal and only *bar* becomes emphasis.
+TEST_F(MarkdownParserTest, MultipleOfThreeBlocksClose) {
+ auto Nodes = parseMarkdown("**foo*bar*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "**foo");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
TEST_F(MarkdownParserTest, BlockQuote) {
auto Nodes = parseMarkdown("> hello", Arena);
ASSERT_EQ(Nodes.size(), 1u);
>From 518cb8c4205309e71711a11cea0ccaec61a96f28 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 13:33:14 -0400
Subject: [PATCH 25/27] [clang-doc] Parse table cells with inline content in
pipe tables
---
.../clang-doc/support/Markdown.cpp | 42 +++++++++++++++----
.../clang-doc/support/Markdown.h | 25 +++++++----
.../clang-doc/MarkdownParserTest.cpp | 42 ++++++++++++++++++-
3 files changed, 93 insertions(+), 16 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 9ce5339fc8cb6..4e07c56d9a4ad 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -460,18 +460,44 @@ static FencedCodeNode *parseFencedCode(LineReader &Reader,
return Code;
}
+// Splits a pipe table row into cell texts. A single optional leading and
+// trailing pipe are dropped, then the remainder is split on '|' and each cell
+// is trimmed.
+// TODO: A '|' inside a code span or escaped as "\|" should not split a cell.
+static void splitTableRow(StringRef Row, SmallVectorImpl<StringRef> &Cells) {
+ Row = Row.trim();
+ if (Row.starts_with("|"))
+ Row = Row.drop_front();
+ if (Row.ends_with("|"))
+ Row = Row.drop_back();
+ SmallVector<StringRef> Parts;
+ Row.split(Parts, '|');
+ for (StringRef Part : Parts)
+ Cells.push_back(Part.trim());
+}
+
// Parses a pipe table. The cursor must be on the header row, with a separator
-// row following; consecutive lines containing a | are taken as rows.
+// row following; consecutive lines containing a | are taken as body rows. Each
+// cell's text is parsed into inline nodes.
static TableNode *parsePipeTable(LineReader &Reader, BumpPtrAllocator &Arena,
StringSaver &Saver) {
- SmallVector<StringRef> Rows;
- // TODO: Rows are kept as raw line text for now. Table cells may contain
- // inline content (emphasis, code spans, links), so each row may need to be
- // split on '|' and parsed further into structured cells.
+ auto parseRow = [&](StringRef Line) -> TableRow {
+ SmallVector<StringRef> CellTexts;
+ splitTableRow(Line, CellTexts);
+ SmallVector<TableCell> Cells;
+ for (StringRef Text : CellTexts)
+ Cells.push_back(TableCell{parseInline(Text, Arena, Saver)});
+ return TableRow{allocateArray(Cells, Arena)};
+ };
+
+ TableRow Header = parseRow(Reader.advance());
+ Reader.advance(); // skip the alignment separator row
+ SmallVector<TableRow> Body;
while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
- Rows.push_back(Saver.save(Reader.advance().trim()));
- auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
- LDBG() << "emitting TableNode rows=" << Rows.size();
+ Body.push_back(parseRow(Reader.advance()));
+ auto *Table = new (Arena) TableNode(Header, allocateArray(Body, Arena));
+ LDBG() << "emitting TableNode header_cells=" << Header.Cells.size()
+ << " body_rows=" << Body.size();
return Table;
}
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index a9b00a5c10225..622f3cbb3fc63 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -26,7 +26,7 @@
/// ParagraphNode: sequence of inline nodes
/// HeadingNode: ATX heading (# through ######), level 1-6
/// FencedCodeNode: fenced code block (``` or ~~~)
-/// TableNode: pipe table (raw row text; TODO: structured cells)
+/// TableNode: pipe table (a header row and body rows of cells)
/// UnorderedListNode: bullet list (-, *, +)
/// OrderedListNode: numbered list with explicit start number
/// ListItemNode: single item inside a list
@@ -182,13 +182,24 @@ struct FencedCodeNode : MDNode {
}
};
-/// Pipe table. Rows contains the raw text of each row line including the
-/// header and separator rows.
-/// TODO: replace with a structured header/body/cell representation.
+/// A single table cell. Children holds the cell's parsed inline content.
+struct TableCell {
+ llvm::ArrayRef<MDNode *> Children;
+};
+
+/// A table row, split into cells on the row's pipe characters.
+struct TableRow {
+ llvm::ArrayRef<TableCell> Cells;
+};
+
+/// Pipe table. Header is the first row; Body holds the rows following the
+/// alignment separator. Each cell's text is parsed into inline nodes.
+/// TODO: capture per-column alignment from the separator row.
struct TableNode : MDNode {
- llvm::ArrayRef<llvm::StringRef> Rows;
- explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows)
- : MDNode(NodeKind::NK_Table), Rows(Rows) {}
+ TableRow Header;
+ llvm::ArrayRef<TableRow> Body;
+ TableNode(TableRow Header, llvm::ArrayRef<TableRow> Body)
+ : MDNode(NodeKind::NK_Table), Header(Header), Body(Body) {}
static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
};
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 5571478578d93..d07f1fe1a92e0 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -78,7 +78,47 @@ TEST_F(MarkdownParserTest, PipeTable) {
| 1 | 2 |)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_TRUE(isa<TableNode>(Nodes[0]));
+ auto *T = cast<TableNode>(Nodes[0]);
+ ASSERT_EQ(T->Header.Cells.size(), 2u);
+ ASSERT_EQ(T->Header.Cells[0].Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(T->Header.Cells[0].Children[0])->Text, "A");
+ ASSERT_EQ(T->Header.Cells[1].Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(T->Header.Cells[1].Children[0])->Text, "B");
+ ASSERT_EQ(T->Body.size(), 1u);
+ ASSERT_EQ(T->Body[0].Cells.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(T->Body[0].Cells[0].Children[0])->Text, "1");
+ EXPECT_EQ(cast<TextNode>(T->Body[0].Cells[1].Children[0])->Text, "2");
+}
+
+// A table cell's text runs through the inline parser, so emphasis inside a cell
+// becomes an EmphasisNode rather than literal text.
+TEST_F(MarkdownParserTest, TableCellWithEmphasis) {
+ auto Nodes = parseMarkdown(R"(| *a* | b |
+|---|---|
+| c | d |)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *T = cast<TableNode>(Nodes[0]);
+ ASSERT_EQ(T->Header.Cells.size(), 2u);
+ ASSERT_EQ(T->Header.Cells[0].Children.size(), 1u);
+ auto *Em = cast<EmphasisNode>(T->Header.Cells[0].Children[0]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "a");
+ EXPECT_EQ(cast<TextNode>(T->Header.Cells[1].Children[0])->Text, "b");
+}
+
+// A code span inside a table cell becomes an InlineCodeNode.
+TEST_F(MarkdownParserTest, TableCellWithInlineCode) {
+ auto Nodes = parseMarkdown(R"(| `x` | y |
+|---|---|
+| z | w |)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *T = cast<TableNode>(Nodes[0]);
+ ASSERT_EQ(T->Header.Cells.size(), 2u);
+ ASSERT_EQ(T->Header.Cells[0].Children.size(), 1u);
+ EXPECT_EQ(cast<InlineCodeNode>(T->Header.Cells[0].Children[0])->Code, "x");
+ EXPECT_EQ(cast<TextNode>(T->Body[0].Cells[0].Children[0])->Text, "z");
}
TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
>From 2319b205870a405d501ddde6d39bdbbbb8d3572a Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 16:32:44 -0400
Subject: [PATCH 26/27] [clang-doc] Trim over-explained comments and fix
inaccurate parseMarkdown doc
---
clang-tools-extra/clang-doc/support/Markdown.cpp | 16 +++++-----------
clang-tools-extra/clang-doc/support/Markdown.h | 6 +-----
2 files changed, 6 insertions(+), 16 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 4e07c56d9a4ad..079ae5d12e9f2 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -89,10 +89,8 @@ static unsigned atxHeadingLevel(StringRef Line) {
return Level;
}
-// A forward cursor over the lines of a paragraph. Encapsulates the parse
-// position so the loop can inspect the current or an upcoming line and consume
-// lines without manual index arithmetic. Lines are stored untrimmed; callers
-// trim where they need a normalized view.
+// A forward cursor over the lines of a paragraph. Lines are stored untrimmed;
+// callers trim where they need a normalized view.
class LineReader {
public:
explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
@@ -125,11 +123,8 @@ class LineReader {
size_t Pos = 0;
};
-// A forward cursor over the characters of a string. The character-level analog
-// of LineReader: the inline scanner inspects the current or an upcoming
-// character and consumes characters without manual index arithmetic. position()
-// and seek() let it interoperate with the index-based run and delimiter helpers
-// below, since inline constructs are not consumed one character at a time.
+// A forward cursor over the characters of a string. position() and seek() let
+// it interoperate with the index-based run and delimiter helpers below.
class CharReader {
public:
explicit CharReader(StringRef S) : S(S) {}
@@ -434,8 +429,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
// TODO: Follow CommonMark spec §4.5 more closely. Opening fences may be
// indented up to 3 spaces, the closing fence must use the same character and be
// at least as long as the opening fence, and the closing fence may only be
-// followed by spaces. Doxygen specifics should be handled on a case-by-case
-// basis.
+// followed by spaces.
static FencedCodeNode *parseFencedCode(LineReader &Reader,
BumpPtrAllocator &Arena,
StringSaver &Saver) {
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 622f3cbb3fc63..01027c170e51e 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -168,10 +168,6 @@ struct HeadingNode : MDNode {
/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g.
/// "cpp"); empty when no language was specified. Lines contains the raw text
/// of each interior line, without the opening or closing fence.
-///
-/// TODO: Follow CommonMark spec §4.5. The opening fence may be indented up
-/// to 3 spaces; the closing fence must use the same character and be at least
-/// as long as the opening fence; only spaces may follow the closing fence.
struct FencedCodeNode : MDNode {
llvm::StringRef Lang;
llvm::ArrayRef<llvm::StringRef> Lines;
@@ -259,7 +255,7 @@ struct ThematicBreakNode : MDNode {
/// Parse Markdown from a single paragraph of plain text. Returns a list of
/// top-level nodes allocated in Arena. Returns an empty ArrayRef only for empty
/// or whitespace-only input; plain text with no Markdown constructs returns a
-/// single TextNode.
+/// single ParagraphNode.
///
/// The caller must keep Arena alive for the lifetime of any returned nodes.
llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
>From 6a3d7c120676e25bc48aa1aae53cce1c00f8d8a8 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 16:32:51 -0400
Subject: [PATCH 27/27] [clang-doc] Trim over-explained comments and fix
inaccurate parseMarkdown doc
---
.gitignore | 1 +
CLANG_COMMENT_PARSER_NOTES.md | 175 +++++++++++++++++++++++++++
MARKDOWN_PARSER_RESEARCH.md | 220 ++++++++++++++++++++++++++++++++++
followed | 0
prefix | 0
5 files changed, 396 insertions(+)
create mode 100644 CLANG_COMMENT_PARSER_NOTES.md
create mode 100644 MARKDOWN_PARSER_RESEARCH.md
create mode 100644 followed
create mode 100644 prefix
diff --git a/.gitignore b/.gitignore
index 9d4e86ab10caa..5addf57e504b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,3 +88,4 @@ pythonenv*
/clang/utils/analyzer/projects/*/RefScanBuildResults
# automodapi puts generated documentation files here.
/lldb/docs/python_api/
+GSOC_CONTEXT.md
diff --git a/CLANG_COMMENT_PARSER_NOTES.md b/CLANG_COMMENT_PARSER_NOTES.md
new file mode 100644
index 0000000000000..6e7193368e2f9
--- /dev/null
+++ b/CLANG_COMMENT_PARSER_NOTES.md
@@ -0,0 +1,175 @@
+# Clang Comment Parser Notes (Markdown integration scouting)
+
+Scope: understand how Clang turns a doc comment into a comment AST, where
+paragraph boundaries get drawn, and where the standalone Markdown library could
+eventually hook in. Read against `clang/lib/AST/CommentLexer.cpp`,
+`clang/lib/AST/CommentParser.cpp`, and `clang/include/clang/AST/Comment.h`.
+
+The pipeline is three stages: raw text -> tokens (lexer) -> comment AST
+(parser, via Sema) -> consumer (clang-doc's serializer today).
+
+## 1. Tokenization (CommentLexer)
+
+The lexer emits a flat stream of comment tokens. The token kinds live in
+`tok::TokenKind` (`clang/include/clang/AST/CommentLexer.h:32`):
+
+- `eof`, `newline`, `text` -- the common cases.
+- `unknown_command`, `backslash_command`, `at_command` -- `\foo` / `@foo`.
+- `verbatim_block_begin`, `verbatim_block_line`, `verbatim_block_end` --
+ `\code ... \endcode` and friends.
+- `verbatim_line_name`, `verbatim_line_text` -- single-line verbatim commands.
+- `html_start_tag`, `html_ident`, `html_equals`, `html_quoted_string`,
+ `html_greater`, `html_slash_greater`, `html_end_tag` -- raw HTML passthrough.
+
+Key behaviors:
+
+- The lexer runs a two-level state machine: a comment state (`//` vs `/* */`)
+ and a lexing state (`LS_Normal`, verbatim block/line states, HTML states),
+ both declared in `CommentLexer.h`.
+- Plain text and newlines: `Lexer::lexCommentText()`
+ (`CommentLexer.cpp:305`) walks a line, using `skipTextToken()`
+ (`CommentLexer.cpp:282`) to find the next boundary and emitting a single
+ `tok::text` for the run, then a `tok::newline` at each line end. Newlines are
+ their own tokens -- this matters for paragraph detection downstream.
+- Leading decoration is stripped per line by `skipLineStartingDecorations()`
+ (`CommentLexer.cpp:90`): horizontal whitespace plus a single leading `*` in
+ `/* */` comments. It is called after each newline (`CommentLexer.cpp:322`).
+ So by the time text reaches the parser, the `*` gutter and `///` markers are
+ already gone, but inner text is otherwise verbatim (interior spaces kept).
+- Verbatim blocks are NOT decomposed. `setupAndLexVerbatimBlock()`
+ (`CommentLexer.cpp:468`) and `lexVerbatimBlockFirstLine()`
+ (`CommentLexer.cpp:493`) emit one `tok::verbatim_block_line` per line with the
+ raw line text preserved. This is Doxygen's existing `\code` path: content is
+ already kept intact, not re-lexed.
+
+## 2. Parsing and paragraph boundaries (CommentParser)
+
+Entry point: `Parser::parseFullComment()` (`CommentParser.cpp:923`) returns a
+`FullComment *`. It skips leading newlines, then loops calling
+`parseBlockContent()` until `eof`, collecting `BlockContentComment *` blocks
+(`CommentParser.cpp:928-935`).
+
+`parseBlockContent()` (`CommentParser.cpp:892`) dispatches on the current token:
+text / command / HTML -> `parseParagraphOrBlockCommand()`;
+`verbatim_block_begin` -> `parseVerbatimBlock()`;
+`verbatim_line_name` -> `parseVerbatimLine()`.
+
+Paragraph assembly and boundaries happen in
+`parseParagraphOrBlockCommand()` (`CommentParser.cpp:719`). It accumulates a
+`SmallVector<InlineContentComment *> Content` and ends the paragraph on:
+
+- `verbatim_block_begin`, `verbatim_line_name`, or `eof`
+ (`CommentParser.cpp:724-727`).
+- A block command (`Info->IsBlockCommand`), e.g. `\brief`, `\param`
+ (`CommentParser.cpp:739-743`). If the paragraph is still empty it becomes a
+ block command instead (`parseBlockCommand()`).
+- **A blank line.** This is the core boundary rule
+ (`CommentParser.cpp:765-786`): on a `tok::newline`, peek the next token; two
+ consecutive newlines (or `newline`, whitespace-only `text`, `newline`) end
+ the paragraph. A single newline is non-terminating -- it just calls
+ `addTrailingNewline()` on the last inline node and continues.
+
+So paragraph boundaries are purely lexical (blank line / block command / EOF),
+decided here and nowhere else. There is no Markdown-aware blocking: a fenced
+code block written with ```` ``` ```` is, to this parser, just `text` lines
+inside one paragraph (unless the author used `\code`, which becomes a verbatim
+block instead).
+
+Inline content within a paragraph is built through Sema factory calls, not
+constructors: `S.actOnText()` for `tok::text` (`CommentParser.cpp:798`),
+`parseInlineCommand()` for inline `\foo`, `parseHTMLStartTag()` /
+`parseHTMLEndTag()` for HTML. The finished paragraph is created by
+`S.actOnParagraphComment(S.copyArray(ArrayRef(Content)))`
+(`CommentParser.cpp:817`).
+
+## 3. The comment AST (Comment.h)
+
+Two abstract bases under `Comment`:
+
+- `InlineContentComment` (`Comment.h:271`): `TextComment` (`Comment.h:297`),
+ `InlineCommandComment`, `HTMLStartTagComment`, `HTMLEndTagComment`.
+- `BlockContentComment` (`Comment.h:559`): `ParagraphComment` (`Comment.h:576`),
+ `BlockCommandComment` (and its `ParamCommandComment` / `TParamCommandComment`),
+ `VerbatimBlockComment` (`Comment.h:900`), `VerbatimLineComment`.
+
+`FullComment` (`Comment.h:1106`) is the top node, holding the block list.
+
+For Markdown, the text we care about lives in:
+
+- `ParagraphComment::Content` -- `ArrayRef<InlineContentComment *>`
+ (`Comment.h:577`), iterable via `child_begin()`/`child_end()`.
+- `TextComment::getText()` -> `StringRef` (`Comment.h:315`). This is the leaf
+ source of all plain text. Inline command and HTML nodes wrap/annotate text
+ but don't hold paragraph prose directly.
+
+To reconstruct a paragraph's plain text you walk `ParagraphComment`'s children
+and concatenate each `TextComment::getText()` (re-inserting the soft newlines
+the lexer split on). That is exactly the shape `parseMarkdown()` already
+expects: one paragraph of plain text in, block nodes out.
+
+## 4. Where clang-doc consumes this today
+
+clang-doc does not touch the comment parser directly. In `Serialize.cpp` it
+calls `RawComment::parse()` to get a `FullComment`
+(`Serialize.cpp:611, 931`), then `Serializer::parseFullComment()`
+(`Serialize.cpp:385`) runs `ClangDocCommentVisitor` (`Serialize.cpp:204`),
+which mirrors the comment AST into clang-doc's own `CommentInfo` tree
+(`visitTextComment`, etc.). `CommentInfo` is what the generators see.
+
+The current JSON Markdown integration (on `md-json-integration`) parses at the
+**generator** level: in `JSONGenerator.cpp`, when a `CK_ParagraphComment`'s
+children are all `CK_TextComment`, it concatenates their text and runs
+`parseMarkdown()`. So the prose has already made a round trip through tokens ->
+`FullComment` -> `CommentInfo` before we parse it.
+
+## 5. Candidate hook points (shallow to deep)
+
+1. **Generator level (current).** Parse in `JSONGenerator.cpp` over
+ `CommentInfo`. Pros: isolated, no Clang AST changes, already working. Cons:
+ clang-doc-only; every other comment consumer (the other generators, anything
+ in Clang) gets nothing; relies on `CommentInfo` faithfully preserving the
+ text and on the all-text-children heuristic.
+
+2. **clang-doc serializer level.** Parse inside `ClangDocCommentVisitor` when a
+ `ParagraphComment` is visited (`Serialize.cpp:~204-249`), attaching parsed
+ Markdown to `CommentInfo` once for all generators. Pros: every clang-doc
+ backend benefits, single place, still no core-Clang change. Cons: still
+ clang-doc-only; needs a `CommentInfo` field to carry the parsed tree.
+
+3. **Comment parser / Sema level (long-term goal).** Teach the comment layer
+ itself about Markdown. The clean seam is `parseParagraphOrBlockCommand()`
+ (`CommentParser.cpp:719`) / `actOnParagraphComment()`: a paragraph's
+ collected text is available right before the node is built, and paragraph
+ boundaries are already computed here. A Markdown pass could run over each
+ `ParagraphComment`'s text (or over `FullComment` as a post-pass) and produce
+ structured nodes shared by all of Clang, not just clang-doc. Cons: largest
+ blast radius (touches a core AST that many tools depend on), needs new AST
+ node types or a side table, and has to coexist with the existing Doxygen
+ constructs (`\code` verbatim blocks, inline commands, HTML) rather than
+ re-interpreting them.
+
+### Practical notes for whichever hook
+
+- Verbatim blocks already solve the fenced-code case via `\code`/`\endcode`;
+ Markdown ```` ``` ```` fences arrive as ordinary `text` lines. A hook should
+ decide whether to treat ```` ``` ```` as code (parse it) or defer to existing
+ `\code` blocks, and per Erick's guidance, skip paragraphs that already carry
+ Doxygen code tags.
+- Paragraph text reaches us with the `///` and `*` gutters stripped
+ (`skipLineStartingDecorations`, `CommentLexer.cpp:90`) but soft newlines
+ intact as separate tokens, so reassembly needs to re-join lines with `\n`,
+ which is what the generator hook already does.
+- Boundaries are blank-line based, so a multi-line Markdown construct (table,
+ list, fenced block) survives as a single paragraph only if it has no blank
+ lines inside it. Anything separated by a blank line is already a distinct
+ `ParagraphComment` and would be parsed independently.
+
+## Open questions / follow-ups
+
+- Does `CommentInfo` preserve enough to reconstruct the original line breaks
+ losslessly for option 1/2, or do we need to carry the raw paragraph text
+ through? (Check what `visitTextComment` stores vs. drops.)
+- For option 3, would mentors want new `comments::` AST nodes, or a separate
+ Markdown tree hung off `FullComment` as a side channel?
+- How should this interact with `-fparse-all-comments` and with non-Doxygen
+ comment styles?
diff --git a/MARKDOWN_PARSER_RESEARCH.md b/MARKDOWN_PARSER_RESEARCH.md
new file mode 100644
index 0000000000000..c6581e881357e
--- /dev/null
+++ b/MARKDOWN_PARSER_RESEARCH.md
@@ -0,0 +1,220 @@
+# Markdown Parser Research: Block vs Inline Split and Inline Scanners
+
+Scope: how three established Markdown parsers (cmark, pulldown-cmark, goldmark)
+separate block parsing from inline parsing, how their character-level inline
+scanners are built, and which patterns are worth pulling into the clang-doc
+parser (`clang-tools-extra/clang-doc/support/Markdown.cpp`).
+
+Sources read:
+- cmark `src/blocks.c` and `src/inlines.c` (reference CommonMark C library)
+- pulldown-cmark `pulldown-cmark/src/parse.rs` (Rust pull parser)
+- goldmark `parser/parser.go` (Go CommonMark parser)
+
+## The common architecture: two phases
+
+All three parsers make the same top-level decision: **build the block tree
+first from raw line spans, then parse inline content in a second pass.** Block
+parsing never looks inside a run of text for emphasis or code; it only decides
+where paragraphs, code fences, lists, headings, and quotes begin and end. Inline
+parsing runs afterward over the text that the block phase collected.
+
+This split exists because block boundaries are decided line by line (a blank
+line ends a paragraph, a fence line opens a code block) while inline structure is
+decided character by character within already-delimited text. Keeping them
+separate means the inline scanner can assume it is looking at a single
+self-contained run of text with no block transitions to worry about.
+
+---
+
+## cmark (C)
+
+### Block / inline split
+- Lines are fed through `S_parser_feed` -> `S_process_line`. Each line runs
+ three steps: `check_open_blocks` (do existing open containers continue?),
+ `open_new_blocks` (does a new block start here?), and
+ `add_text_to_container` (append the raw line text to the matched block).
+- Open blocks are tracked as a tree of `cmark_node` with a
+ `CMARK_NODE__OPEN` flag and a `parser->current` pointer. `add_child` walks up
+ the tree until it finds a parent that can accept the new child.
+- Block boundaries are detected by small dedicated scanners:
+ `scan_open_code_fence` / `scan_close_code_fence`, `scan_atx_heading_start`,
+ `parse_list_marker`, `parse_block_quote_prefix`, `S_scan_thematic_break`,
+ `scan_setext_heading_line`. Indentation is normalized by
+ `S_find_first_nonspace` and `S_advance_offset` (with tab expansion).
+- Inline parsing is explicitly deferred. `cmark_parser_finish` calls
+ `finalize_document`, then `process_inlines` iterates the finished tree with
+ `cmark_iter_new` and, for blocks where `contains_inlines()` is true
+ (paragraphs and headings only), calls `cmark_parse_inlines`. After a block's
+ inlines are parsed, its raw text buffer is freed.
+
+### Inline scanner
+- The scanner is a character loop over a `subject` struct that holds the input
+ chunk and a `pos` cursor, plus a `last_delim` delimiter stack pointer and a
+ `last_bracket` pointer for links.
+- `parse_inline` reads the current byte with `peek_char` and dispatches with a
+ switch: backtick -> `handle_backticks`, backslash -> `handle_backslash`,
+ `*` `_` `'` `"` -> `handle_delim`, `[` / `]` -> bracket handling, and a
+ default case that grabs a whole text run.
+- Text runs are accumulated in bulk: `subject_find_special_char` scans forward
+ using a lookup table of special bytes (`\r\n\\` `` ` `` `&_*[]<!`) and the
+ default case copies everything up to the next special byte into one text node.
+ The scanner does not advance one character at a time through ordinary text.
+- Code spans: `handle_backticks` records the opening backtick run length, then
+ `scan_to_closing_backticks` searches forward for a run of identical length.
+ Content is normalized by `S_normalize_code` (newlines to spaces, strip one
+ leading and trailing space).
+- Emphasis: a **delimiter stack**. `scan_delims` computes left/right flanking
+ (`can_open`, `can_close`) from the surrounding character classes.
+ `push_delimiter` records each `*`/`_` run. After the run is scanned,
+ `process_emphasis` walks the stack backward pairing closers with openers,
+ honoring the length-mod-3 rule via an `openers_bottom` table, and
+ `S_insert_emph` builds the EMPH/STRONG nodes and splices children between the
+ matched delimiters.
+
+---
+
+## pulldown-cmark (Rust)
+
+### Block / inline split
+- Self-described as a "tree-based two pass parser." `run_first_pass` builds a
+ `Tree<Item>` of block structure. Each `Item` stores `start`/`end` byte offsets
+ into the original source rather than copied strings.
+- The second pass is lazy: during event iteration, when the cursor reaches an
+ item whose body `is_maybe_inline()`, it calls `handle_inline`, which scans that
+ item's byte range for inline constructs and rewrites the tree in place.
+- `ItemBody` is an enum that distinguishes block kinds (`Paragraph`, `Heading`,
+ `List`) from unresolved inline markers (`MaybeEmphasis`, `MaybeCode`,
+ `MaybeLinkOpen`) and resolved inlines (`Code`, `Link`, `Emphasis`). The first
+ pass emits the "Maybe" variants; the inline pass resolves them.
+
+### Inline scanner
+- Two stages. `handle_inline_pass1` resolves code spans, HTML, and links first
+ (constructs whose interior must not be reprocessed), walking sibling tree nodes
+ via `tree[cur].next`. `handle_emphasis_and_hard_break` then resolves emphasis.
+- Emphasis uses an `InlineStack` of `InlineEl { start, count, run_length, c,
+ both }`. On a closer, `find_match` searches the stack backward for a
+ compatible opener, applying the same `(count + el.count) % 3 != 0` rule cmark
+ uses, and nests matches from the inside out. Leftover delimiters become text.
+- Code spans use a `CodeDelims` map keyed by backtick-run length
+ (`HashMap<usize, VecDeque<TreeIndex>>`), so a closing run of a given length can
+ be found without rescanning. `make_code_span` extracts and normalizes the span
+ and interns it through an allocation pool (`allocate_cow`).
+- The notable structural idea is that nodes carry source spans, and the inline
+ pass mutates tree indices in place rather than allocating a fresh node list.
+
+---
+
+## goldmark (Go)
+
+### Block / inline split
+- Interface driven. `BlockParser` has `Open` / `Continue` / `Close`;
+ `InlineParser` has `Parse` and `Trigger`. `Parse()` runs `parseBlocks` to build
+ the block AST, then `walkBlock` does a post-order traversal calling
+ `parseBlock`, which performs inline parsing on each finished block.
+- Open blocks are tracked as `openedBlocks []Block` (each a node plus its
+ parser) on the parser `Context`. Per line, open blocks try to `Continue`; if
+ they decline, `openBlocks` tries to start new ones. This is what implements
+ lazy paragraph continuation.
+
+### Inline scanner
+- Dispatch is table driven. Goldmark keeps a `[256][]InlineParser` array indexed
+ by trigger byte. Each `InlineParser` registers the bytes it cares about via
+ `Trigger()`. During `parseBlock`, the current byte selects
+ `inlineParsers[parserChar]` and only those parsers run, instead of testing
+ every parser at every position.
+- Emphasis uses a doubly-linked delimiter list on the `Context`
+ (`delimiters`, `lastDelimiter`) with `PushDelimiter`, `RemoveDelimiter`,
+ `ClearDelimiters`. `ProcessDelimiters` runs at block end to pair openers and
+ closers and build emphasis nodes, again decoupled from the initial scan.
+
+---
+
+## Cross-cutting comparison
+
+| Concern | cmark | pulldown-cmark | goldmark | clang-doc (current) |
+|---|---|---|---|---|
+| Block vs inline | block tree, then `process_inlines` | first pass tree, lazy inline pass | `parseBlocks` then `walkBlock`/`parseBlock` | block loop per line, inline run per plain-text line inline in the loop |
+| Inline dispatch | switch on current byte | enum rewrite over tree | `[256][]InlineParser` trigger table | if-chain on `` ` `` `*` `_` |
+| Text runs | bulk scan to next special byte | span offsets on tree items | parser-driven segments | char-by-char with a `flushText` lambda |
+| Code spans | matching backtick run | length-keyed delimiter map | trigger parser | matching backtick run (already aligned) |
+| Emphasis | delimiter stack + flanking + mod-3 | `InlineStack` + `find_match` + mod-3 | delimiter linked list + `ProcessDelimiters` | recursive `findClosingDelim` forward search (simplified subset) |
+| Text storage | raw buffer, freed after inline | source byte spans | reader segments | copied/interned via `StringSaver` |
+
+The three production parsers converge on the same two ideas: a deferred inline
+pass over text the block phase collected, and a delimiter-stack emphasis
+algorithm with flanking rules and the length-mod-3 opener constraint.
+
+---
+
+## Relevance to our LLVM clang-doc parser
+
+Current state of `support/Markdown.cpp`:
+- `parseMarkdown` is the block phase. It splits the paragraph into lines, walks
+ them with the `LineReader` cursor, and recognizes fenced code, pipe tables, and
+ unordered lists, falling back to plain text otherwise.
+- `parseInline` is the inline phase: a character scanner over one line with a
+ `Pos` cursor and a `flushText` lambda, dispatching on `` ` ``, `*`, `_`.
+- Emphasis/strong closing is found by `findClosingDelim`, a recursive forward
+ search, documented as a simplified subset of the CommonMark flanking rules.
+ Code spans already use matching backtick-run length via `countRun`.
+
+What maps cleanly, in rough priority order:
+
+1. **Make the inline pass a real second pass over all text-bearing nodes.**
+ Today `parseInline` is called only on the plain-text fallback line, so table
+ cells and list item text never get inline parsing. cmark, pulldown, and
+ goldmark all run inline parsing as a distinct phase over every block that
+ contains text. The lightweight version for us: after the block loop builds the
+ node list, walk it once and run `parseInline` on each text-bearing node
+ (paragraph text, list item text, and eventually table cells). This removes the
+ "inline only happens for loose paragraph lines" gap and matches the TODO
+ already noted on the table rows loop. It does not require a full tree rewrite,
+ just a second traversal of what `parseMarkdown` already produces.
+
+2. **Adopt the delimiter-stack emphasis algorithm if we want conformance.**
+ This is the single biggest structural difference. Our `findClosingDelim` is a
+ forward search that cannot resolve the general overlap and nesting cases
+ (triple runs like `***x***`, runs of differing length, interleaved `*`/`_`).
+ All three references implement the same standard algorithm: scan a run,
+ compute `can_open`/`can_close` from flanking, push onto a stack, then walk the
+ stack backward pairing closers to openers with the length-mod-3 opener
+ constraint. If emphasis conformance becomes a goal, this is the known-correct
+ shape to port, and our current code already isolates the flanking check
+ (`isSpace` neighbor tests) that would feed `can_open`/`can_close`.
+
+3. **Trigger-style dispatch when the inline grammar grows.** Our if-chain on
+ `` ` `` `*` `_` is fine for three constructs. If we add links, autolinks, or
+ entities, goldmark's trigger table (dispatch keyed by the current byte) or
+ cmark's switch keeps the scanner flat instead of a growing if-ladder. Not
+ needed yet; worth keeping in mind so the scanner does not accrete nested ifs.
+
+4. **Bulk text-run scanning.** cmark's `subject_find_special_char` jumps to the
+ next special byte and emits one text node, rather than advancing one byte at a
+ time. Our scanner advances character by character in the default case. A
+ `find_first_of("`*_")` style jump would cut per-character work on long plain
+ runs. Minor for doc-comment-sized input, but a cheap, local improvement.
+
+5. **Source spans vs copying (consider, do not rush).** pulldown stores byte
+ offsets into the original text instead of copying strings; cmark frees raw
+ buffers after the inline pass. We copy via `StringSaver`. For comment-sized
+ input the copy cost is negligible, and our source text is assembled from
+ concatenated comment fragments that are not guaranteed contiguous or stable,
+ so spans are not a clean fit today. Note it only as a future memory option if
+ the parser ever runs over large contiguous inputs.
+
+### Suggested sequencing
+- Near term: item 1 (deferred inline pass over text-bearing nodes) is the
+ highest-value, lowest-risk change and directly closes the table-cell and
+ list-item inline gap.
+- Medium term: item 2 (delimiter stack) only if CommonMark emphasis conformance
+ is prioritized over the current pragmatic subset.
+- Opportunistic: items 3 and 4 as the inline grammar and input sizes grow.
+
+### One caveat
+The three references are full CommonMark implementations. Our parser
+deliberately targets a doc-comment subset (the existing TODOs say as much), so
+the goal is to borrow structure (the two-phase split, the delimiter-stack shape,
+trigger dispatch) rather than match feature for feature. The most defensible next
+step is structural: turn inline parsing into a real second pass so every
+text-bearing node is handled uniformly, which is the one design choice all three
+parsers share and the one our current code most clearly diverges from.
diff --git a/followed b/followed
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/prefix b/prefix
new file mode 100644
index 0000000000000..e69de29bb2d1d
More information about the cfe-commits
mailing list