[clang-tools-extra] [clang-doc] Add standalone Markdown parsing library (PR #202991)
Neil Nair via cfe-commits
cfe-commits at lists.llvm.org
Sat Jun 13 09:42:35 PDT 2026
https://github.com/Neil-N4 updated https://github.com/llvm/llvm-project/pull/202991
>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 09:51:48 -0400
Subject: [PATCH 01/23] [clang-doc] Add standalone Markdown parsing library
---
.../clang-doc/support/CMakeLists.txt | 3 +-
.../clang-doc/support/Markdown.cpp | 145 ++++++++++++++++++
.../clang-doc/support/Markdown.h | 72 +++++++++
.../unittests/clang-doc/CMakeLists.txt | 4 +-
.../clang-doc/MarkdownParserTest.cpp | 94 ++++++++++++
5 files changed, 316 insertions(+), 2 deletions(-)
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h
create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt
index 8ac913ffbe998..acff865190ff9 100644
--- a/clang-tools-extra/clang-doc/support/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt
@@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS
add_clang_library(clangDocSupport STATIC
File.cpp
+ Markdown.cpp
Utils.cpp
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
new file mode 100644
index 0000000000000..776150b939d27
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLog.h"
+
+#define DEBUG_TYPE "clang-doc-markdown"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+static MDNode makeText(StringRef S) {
+ return {NodeKind::NK_Text, S, {}};
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces,
+// and has at least one -.
+static bool isSepRow(StringRef Line) {
+ return Line.contains('-') &&
+ Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Returns true if Line begins with a bullet list marker (-, *, or +)
+// followed by a space.
+static bool isListItem(StringRef Line) {
+ return Line.starts_with("- ") || Line.starts_with("* ") ||
+ Line.starts_with("+ ");
+}
+
+static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
+ BumpPtrAllocator &Arena) {
+ if (Nodes.empty())
+ return {};
+ MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
+ std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
+ return ArrayRef<MDNode>(Allocated, Nodes.size());
+}
+
+ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
+ BumpPtrAllocator &Arena) {
+ if (ParagraphText.trim().empty())
+ return {};
+
+ SmallVector<StringRef, 16> Lines;
+ ParagraphText.split(Lines, '\n');
+
+ SmallVector<MDNode> Nodes;
+ size_t I = 0, E = Lines.size();
+
+ while (I < E) {
+ StringRef Line = Lines[I].trim();
+
+ if (Line.empty()) {
+ ++I;
+ continue;
+ }
+
+ // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+ // indented up to 3 spaces, the closing fence must use the same character
+ // and be at least as long as the opening fence, and the closing fence may
+ // only be followed by spaces. Doxygen specifics should be handled on a
+ // case-by-case basis.
+ if (Line.starts_with("```") || Line.starts_with("~~~")) {
+ char Fence = Line[0];
+ StringRef Lang = Line.drop_front(3).trim();
+ SmallVector<MDNode> CodeLines;
+ ++I;
+ while (I < E) {
+ StringRef CodeLine = Lines[I].trim();
+ if (CodeLine.size() >= 3 &&
+ all_of(CodeLine.take_front(3),
+ [Fence](char C) { return C == Fence; }))
+ break;
+ CodeLines.push_back(makeText(Lines[I]));
+ ++I;
+ }
+ ++I; // skip closing fence
+ MDNode Code;
+ Code.Kind = NodeKind::NK_FencedCode;
+ Code.Content = Lang;
+ Code.Children = allocateNodes(CodeLines, Arena);
+ LDBG() << "emitting NK_FencedCode lang='" << Lang
+ << "' lines=" << CodeLines.size();
+ Nodes.push_back(Code);
+ continue;
+ }
+
+ // Pipe table: current line has | and next line is a separator row.
+ if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+ SmallVector<MDNode> Rows;
+ while (I < E && Lines[I].trim().contains('|')) {
+ Rows.push_back(makeText(Lines[I].trim()));
+ ++I;
+ }
+ MDNode Table;
+ Table.Kind = NodeKind::NK_Table;
+ Table.Content = {};
+ Table.Children = allocateNodes(Rows, Arena);
+ LDBG() << "emitting NK_Table rows=" << Rows.size();
+ Nodes.push_back(Table);
+ continue;
+ }
+
+ // Unordered list item.
+ if (isListItem(Line)) {
+ SmallVector<MDNode> Items;
+ while (I < E) {
+ StringRef L = Lines[I].trim();
+ if (!isListItem(L))
+ break;
+ MDNode Item;
+ Item.Kind = NodeKind::NK_ListItem;
+ Item.Content = L.drop_front(2).trim();
+ Item.Children = {};
+ Items.push_back(Item);
+ ++I;
+ }
+ MDNode List;
+ List.Kind = NodeKind::NK_UnorderedList;
+ List.Content = {};
+ List.Children = allocateNodes(Items, Arena);
+ LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+ Nodes.push_back(List);
+ continue;
+ }
+
+ // Plain text fallback.
+ Nodes.push_back(makeText(Line));
+ ++I;
+ }
+
+ LDBG() << "parseMarkdown done nodes=" << Nodes.size();
+ return allocateNodes(Nodes, Arena);
+}
+
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
new file mode 100644
index 0000000000000..890f764f937b1
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines a standalone Markdown parsing library for the LLVM
+/// ecosystem. The parser takes plain text and returns a tree of typed nodes
+/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+///
+/// This is a simple Markdown parser for use inside Clang-Doc's comment
+/// pipeline. You give it a paragraph of text and an arena allocator, and it
+/// gives back a list of typed nodes describing the Markdown structure it found.
+///
+/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
+/// you get back an empty list and can fall back to plain-text output. If it
+/// does, you get a tree of MDNode structs where each node has a kind, optional
+/// content (like the language tag on a code fence), and optional children.
+///
+/// All nodes are allocated in the arena you pass in. You own the arena and are
+/// responsible for keeping it alive as long as you use the nodes.
+///
+/// The parser handles fenced code blocks, pipe tables, and unordered lists.
+/// Anything it does not recognize comes back as a plain text node. It will
+/// never crash on bad input.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+
+namespace clang::doc::markdown {
+
+enum class NodeKind {
+ // Block nodes
+ NK_Paragraph,
+ NK_FencedCode,
+ NK_Table,
+ NK_UnorderedList,
+ NK_OrderedList,
+ NK_ListItem,
+ NK_ThematicBreak,
+ // Inline nodes
+ NK_Text,
+ NK_InlineCode,
+ NK_Emphasis,
+ NK_Strong,
+ NK_SoftBreak,
+};
+
+struct MDNode {
+ NodeKind Kind;
+ llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
+ llvm::ArrayRef<MDNode> Children; // arena allocated
+};
+
+/// Parses Markdown from a single comment paragraph's text.
+/// Returns an empty ArrayRef if no Markdown constructs are found,
+/// so generators can fall back to plain-text rendering at zero cost.
+llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
+ llvm::BumpPtrAllocator &Arena);
+
+} // namespace clang::doc::markdown
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 01b34ec9a791e..b74207ac88fa7 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests
ClangDocTest.cpp
GeneratorTest.cpp
HTMLGeneratorTest.cpp
+ MarkdownParserTest.cpp
MDGeneratorTest.cpp
MergeTest.cpp
SerializeTest.cpp
@@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests
target_link_libraries(ClangDocTests
PRIVATE
clangDoc
+ clangDocSupport
LLVMTestingSupport
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
new file mode 100644
index 0000000000000..8df5efc7f1d5f
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/Support/Allocator.h"
+#include "gtest/gtest.h"
+
+using namespace clang::doc::markdown;
+
+namespace {
+
+TEST(MarkdownParserTest, EmptyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, WhitespaceOnlyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown(" \n \n", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("hello world", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
+ EXPECT_EQ(Nodes[0].Content, "hello world");
+}
+
+TEST(MarkdownParserTest, FencedCodeBlock) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(Nodes[0].Content, "cpp");
+ ASSERT_EQ(Nodes[0].Children.size(), 1u);
+}
+
+TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(Nodes[0].Content.empty());
+}
+
+TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+ // Unterminated fence should not crash and should produce a code node
+ // with whatever lines were found.
+ EXPECT_FALSE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PipeTable) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("a | b\nc | d", Arena);
+ // No separator row so should not be parsed as a table
+ for (const auto &Node : Nodes)
+ EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, UnorderedList) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
+ ASSERT_EQ(Nodes[0].Children.size(), 3u);
+ EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
+ EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
+ EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+}
+
+TEST(MarkdownParserTest, MixedContent) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+ EXPECT_EQ(Nodes.size(), 3u);
+}
+
+} // namespace
\ No newline at end of file
>From f4cb4a28630e0f91289bfd4416c59114c5654ff7 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 11:35:54 -0400
Subject: [PATCH 02/23] [clang-doc] Address review feedback: test fixture, raw
strings, DEBUG_TYPE, EOF newlines
---
.../clang-doc/support/Markdown.cpp | 4 +-
.../clang-doc/support/Markdown.h | 2 +-
.../clang-doc/MarkdownParserTest.cpp | 97 +++++++++++--------
3 files changed, 61 insertions(+), 42 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 776150b939d27..9e008abf8b08d 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,7 +12,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
-#define DEBUG_TYPE "clang-doc-markdown"
+#define DEBUG_TYPE "clang-doc"
using namespace llvm;
@@ -142,4 +142,4 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
return allocateNodes(Nodes, Arena);
}
-} // namespace clang::doc::markdown
\ No newline at end of file
+} // namespace clang::doc::markdown
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 890f764f937b1..09b79cc8f2437 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -69,4 +69,4 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
} // namespace clang::doc::markdown
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 8df5efc7f1d5f..ff9bad88da136 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -14,80 +14,99 @@ using namespace clang::doc::markdown;
namespace {
-TEST(MarkdownParserTest, EmptyInput) {
+struct MarkdownParserTest : public ::testing::Test {
llvm::BumpPtrAllocator Arena;
+};
+
+TEST_F(MarkdownParserTest, EmptyInput) {
auto Nodes = parseMarkdown("", Arena);
EXPECT_TRUE(Nodes.empty());
}
-TEST(MarkdownParserTest, WhitespaceOnlyInput) {
- llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
auto Nodes = parseMarkdown(" \n \n", Arena);
EXPECT_TRUE(Nodes.empty());
}
-TEST(MarkdownParserTest, PlainText) {
- llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
- EXPECT_EQ(Nodes[0].Content, "hello world");
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_Text);
+ EXPECT_EQ(N.Content, "hello world");
}
-TEST(MarkdownParserTest, FencedCodeBlock) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlock) {
+ auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;
+````)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(Nodes[0].Content, "cpp");
- ASSERT_EQ(Nodes[0].Children.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "cpp");
+ ASSERT_EQ(N.Children.size(), 1u);
}
-TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
+ auto Nodes = parseMarkdown(R"(```
+some code
+```)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(Nodes[0].Content.empty());
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Content.empty());
}
-TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+ auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;)",
+ Arena);
// Unterminated fence should not crash and should produce a code node
// with whatever lines were found.
EXPECT_FALSE(Nodes.empty());
}
-TEST(MarkdownParserTest, PipeTable) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+TEST_F(MarkdownParserTest, PipeTable) {
+ auto Nodes = parseMarkdown(R"(| A | B |
+|---|---|
+| 1 | 2 |)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
}
-TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("a | b\nc | d", Arena);
- // No separator row so should not be parsed as a table
+TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+ auto Nodes = parseMarkdown(R"(a | b
+c | d)",
+ Arena);
+ // No separator row so should not be parsed as a table.
for (const auto &Node : Nodes)
EXPECT_NE(Node.Kind, NodeKind::NK_Table);
}
-TEST(MarkdownParserTest, UnorderedList) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+TEST_F(MarkdownParserTest, UnorderedList) {
+ auto Nodes = parseMarkdown(R"(- foo
+- bar
+- baz)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
- ASSERT_EQ(Nodes[0].Children.size(), 3u);
- EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
- EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
- EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
+ ASSERT_EQ(N.Children.size(), 3u);
+ EXPECT_EQ(N.Children[0].Content, "foo");
+ EXPECT_EQ(N.Children[1].Content, "bar");
+ EXPECT_EQ(N.Children[2].Content, "baz");
}
-TEST(MarkdownParserTest, MixedContent) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+TEST_F(MarkdownParserTest, MixedContent) {
+ auto Nodes = parseMarkdown(R"(some text
+```
+code
+````
+- item)",
+ Arena);
EXPECT_EQ(Nodes.size(), 3u);
}
>From 3ef8f62edab311caff0907ab2b9a0c3aaeb14353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:45:44 -0400
Subject: [PATCH 03/23] [clang-doc] Add CommonMark spec tests for fenced code
blocks
---
.../clang-doc/MarkdownParserTest.cpp | 112 +++++++++++++++++-
1 file changed, 108 insertions(+), 4 deletions(-)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ff9bad88da136..4ca979c1f1d24 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -39,7 +39,7 @@ TEST_F(MarkdownParserTest, PlainText) {
TEST_F(MarkdownParserTest, FencedCodeBlock) {
auto Nodes = parseMarkdown(R"(```cpp
int x = 0;
-````)",
+````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
const auto &N = Nodes[0];
@@ -51,7 +51,7 @@ int x = 0;
TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
auto Nodes = parseMarkdown(R"(```
some code
-```)",
+```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
const auto &N = Nodes[0];
@@ -102,12 +102,116 @@ TEST_F(MarkdownParserTest, UnorderedList) {
TEST_F(MarkdownParserTest, MixedContent) {
auto Nodes = parseMarkdown(R"(some text
-```
+```````
code
-````
+````````
- item)",
Arena);
EXPECT_EQ(Nodes.size(), 3u);
}
+// CommonMark §4.5 example 120: tilde fences work the same as backtick fences.
+TEST_F(MarkdownParserTest, TildeFence) {
+ auto Nodes = parseMarkdown(R"(~~~
+int x = 0;
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Content.empty());
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 120: tilde fence with a language tag.
+TEST_F(MarkdownParserTest, TildeFenceWithLang) {
+ auto Nodes = parseMarkdown(R"(~~~cpp
+int x = 0;
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "cpp");
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
+TEST_F(MarkdownParserTest, ClosingFenceMustMatchOpeningChar) {
+ auto Nodes = parseMarkdown(R"(```
+aaa
+~~~
+````````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ // ~~~ is content, not a closing fence.
+ ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 130: a code block can be empty.
+TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
+ auto Nodes = parseMarkdown(R"(```
+```````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Children.empty());
+}
+
+// CommonMark §4.5 example 129: a code block may contain only blank lines.
+TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
+ auto Nodes = parseMarkdown("```\n\n \n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 142: lang tag is captured from the info string.
+TEST_F(MarkdownParserTest, InfoStringLangTag) {
+ auto Nodes = parseMarkdown(R"(```ruby
+def foo(x)
+ return 3
+end
+``````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "ruby");
+ ASSERT_EQ(N.Children.size(), 3u);
+}
+
+// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
+TEST_F(MarkdownParserTest, TildeFenceInfoStringWithBackticks) {
+ auto Nodes = parseMarkdown(R"(~~~ aa ``` ~~~
+foo
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "aa ``` ~~~");
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 124: closing fence must be at least as long as the
+// opening fence.
+// TODO: our parser currently closes on the first line with 3 matching fence
+// chars regardless of opening fence length. Fix as part of the CommonMark
+// TODO in parseMarkdown().
+TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
+ auto Nodes = parseMarkdown("````\naaa\n```", Arena);
+ // The ``` line should not close the ```` fence per CommonMark, but our
+ // parser currently treats it as a closing fence. This test documents the
+ // current (non-conformant) behavior.
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
} // namespace
\ No newline at end of file
>From ffb56028d83a542a775119a7b0c2f88271b2df84 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:59:52 -0400
Subject: [PATCH 04/23] [clang-doc] Replace flat MDNode with typed node
hierarchy using LLVM RTTI
---
.../clang-doc/support/Markdown.cpp | 84 +++---
.../clang-doc/support/Markdown.h | 264 ++++++++++++++++--
.../clang-doc/MarkdownParserTest.cpp | 84 +++---
3 files changed, 312 insertions(+), 120 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 9e008abf8b08d..bee15c3e23ec3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -18,8 +18,24 @@ using namespace llvm;
namespace clang::doc::markdown {
-static MDNode makeText(StringRef S) {
- return {NodeKind::NK_Text, S, {}};
+// Allocates a contiguous array of T in the arena and returns an ArrayRef.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+ BumpPtrAllocator &Arena) {
+ if (Vec.empty())
+ return {};
+ T *Allocated = Arena.Allocate<T>(Vec.size());
+ std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated);
+ return ArrayRef<T>(Allocated, Vec.size());
+}
+
+// Interns a StringRef into the arena so it outlives the parse loop.
+static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
+ if (S.empty())
+ return {};
+ char *Buf = Arena.Allocate<char>(S.size());
+ std::copy(S.begin(), S.end(), Buf);
+ return StringRef(Buf, S.size());
}
// A line is a table separator if it only contains |, -, :, and spaces,
@@ -36,24 +52,15 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
-static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
- BumpPtrAllocator &Arena) {
- if (Nodes.empty())
- return {};
- MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
- std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
- return ArrayRef<MDNode>(Allocated, Nodes.size());
-}
-
-ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
- BumpPtrAllocator &Arena) {
+ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
+ BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
return {};
SmallVector<StringRef, 16> Lines;
ParagraphText.split(Lines, '\n');
- SmallVector<MDNode> Nodes;
+ SmallVector<MDNode *> Nodes;
size_t I = 0, E = Lines.size();
while (I < E) {
@@ -71,8 +78,8 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
// case-by-case basis.
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
- StringRef Lang = Line.drop_front(3).trim();
- SmallVector<MDNode> CodeLines;
+ StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ SmallVector<StringRef> CodeLines;
++I;
while (I < E) {
StringRef CodeLine = Lines[I].trim();
@@ -80,15 +87,13 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(makeText(Lines[I]));
+ CodeLines.push_back(internString(Lines[I], Arena));
++I;
}
++I; // skip closing fence
- MDNode Code;
- Code.Kind = NodeKind::NK_FencedCode;
- Code.Content = Lang;
- Code.Children = allocateNodes(CodeLines, Arena);
- LDBG() << "emitting NK_FencedCode lang='" << Lang
+ auto *Code =
+ new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+ LDBG() << "emitting FencedCodeNode lang='" << Lang
<< "' lines=" << CodeLines.size();
Nodes.push_back(Code);
continue;
@@ -96,50 +101,45 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
- SmallVector<MDNode> Rows;
+ SmallVector<StringRef> Rows;
while (I < E && Lines[I].trim().contains('|')) {
- Rows.push_back(makeText(Lines[I].trim()));
+ Rows.push_back(internString(Lines[I].trim(), Arena));
++I;
}
- MDNode Table;
- Table.Kind = NodeKind::NK_Table;
- Table.Content = {};
- Table.Children = allocateNodes(Rows, Arena);
- LDBG() << "emitting NK_Table rows=" << Rows.size();
+ auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+ LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
continue;
}
// Unordered list item.
if (isListItem(Line)) {
- SmallVector<MDNode> Items;
+ SmallVector<ListItemNode *> Items;
while (I < E) {
StringRef L = Lines[I].trim();
if (!isListItem(L))
break;
- MDNode Item;
- Item.Kind = NodeKind::NK_ListItem;
- Item.Content = L.drop_front(2).trim();
- Item.Children = {};
+ StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+ SmallVector<MDNode *> ItemChildren;
+ ItemChildren.push_back(new (Arena) TextNode(ItemText));
+ auto *Item =
+ new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
Items.push_back(Item);
++I;
}
- MDNode List;
- List.Kind = NodeKind::NK_UnorderedList;
- List.Content = {};
- List.Children = allocateNodes(Items, Arena);
- LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+ auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+ LDBG() << "emitting UnorderedListNode items=" << Items.size();
Nodes.push_back(List);
continue;
}
// Plain text fallback.
- Nodes.push_back(makeText(Line));
+ Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
++I;
}
LDBG() << "parseMarkdown done nodes=" << Nodes.size();
- return allocateNodes(Nodes, Arena);
+ return allocateArray(Nodes, Arena);
}
-} // namespace clang::doc::markdown
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 09b79cc8f2437..3d457bcddfac6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -7,30 +7,50 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// This file defines a standalone Markdown parsing library for the LLVM
-/// ecosystem. The parser takes plain text and returns a tree of typed nodes
-/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+/// Standalone Markdown parsing library for the LLVM ecosystem.
///
-/// This is a simple Markdown parser for use inside Clang-Doc's comment
-/// pipeline. You give it a paragraph of text and an arena allocator, and it
-/// gives back a list of typed nodes describing the Markdown structure it found.
+/// The parser takes plain paragraph text and returns a polymorphic tree of
+/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
+/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
+/// type carries exactly the fields it needs -- no overloaded Content field,
+/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting; each concrete type provides classof() for this purpose.
///
-/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
-/// you get back an empty list and can fall back to plain-text output. If it
-/// does, you get a tree of MDNode structs where each node has a kind, optional
-/// content (like the language tag on a code fence), and optional children.
+/// See
+/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
///
-/// All nodes are allocated in the arena you pass in. You own the arena and are
-/// responsible for keeping it alive as long as you use the nodes.
+/// Field ordering in each derived struct is chosen to minimize padding:
+/// 4-byte fields (like Level or Start) are declared before 16-byte fields
+/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
+/// base class's 4-byte Kind and the first derived field.
///
-/// The parser handles fenced code blocks, pipe tables, and unordered lists.
-/// Anything it does not recognize comes back as a plain text node. It will
-/// never crash on bad input.
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
+/// TextNode -- plain text run
+/// SoftBreakNode -- soft line break
+/// HardBreakNode -- hard line break (trailing spaces or backslash)
+/// InlineCodeNode -- inline code span (`code`)
+/// EmphasisNode -- emphasis (*text* or _text_)
+/// StrongNode -- strong emphasis (**text** or __text__)
+///
+/// Block nodes:
+/// ParagraphNode -- sequence of inline nodes
+/// HeadingNode -- ATX heading (# through ######), level 1-6
+/// FencedCodeNode -- fenced code block (``` or ~~~)
+/// TableNode -- pipe table (raw row text; TODO: structured cells)
+/// UnorderedListNode -- bullet list (-, *, +)
+/// OrderedListNode -- numbered list with explicit start number
+/// ListItemNode -- single item inside a list
+/// BlockQuoteNode -- block quote (>)
+/// ThematicBreakNode -- horizontal rule (---, ***, ___)
+///
+/// All nodes are arena-allocated. The caller owns the arena and must keep it
+/// alive for the lifetime of any returned nodes. The parser never crashes on
+/// malformed input; unrecognized text falls back to TextNode.
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
@@ -38,35 +58,217 @@
namespace clang::doc::markdown {
+/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
+/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
+/// cheap range-based checks in classof() implementations.
enum class NodeKind {
+ // Inline nodes
+ NK_Text,
+ NK_SoftBreak,
+ NK_HardBreak,
+ NK_InlineCode,
+ NK_Emphasis,
+ NK_Strong,
+ NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+
// Block nodes
NK_Paragraph,
+ NK_Heading,
NK_FencedCode,
NK_Table,
NK_UnorderedList,
NK_OrderedList,
NK_ListItem,
+ NK_BlockQuote,
NK_ThematicBreak,
- // Inline nodes
- NK_Text,
- NK_InlineCode,
- NK_Emphasis,
- NK_Strong,
- NK_SoftBreak,
+ NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
};
+/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
+/// Nodes are arena-allocated and have no virtual destructor; use
+/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
struct MDNode {
NodeKind Kind;
- llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
- llvm::ArrayRef<MDNode> Children; // arena allocated
+ explicit MDNode(NodeKind K) : Kind(K) {}
+};
+
+//===----------------------------------------------------------------------===//
+// Inline nodes
+//===----------------------------------------------------------------------===//
+
+/// Plain text run.
+struct TextNode : MDNode {
+ llvm::StringRef Text;
+ explicit TextNode(llvm::StringRef Text)
+ : MDNode(NodeKind::NK_Text), Text(Text) {}
+ static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
+};
+
+/// Soft line break -- a newline that does not end the paragraph.
+struct SoftBreakNode : MDNode {
+ SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_SoftBreak;
+ }
+};
+
+/// Hard line break -- two trailing spaces or a backslash before a newline.
+struct HardBreakNode : MDNode {
+ HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_HardBreak;
+ }
+};
+
+/// Inline code span: `code`. Code does not include the surrounding backticks.
+struct InlineCodeNode : MDNode {
+ llvm::StringRef Code;
+ explicit InlineCodeNode(llvm::StringRef Code)
+ : MDNode(NodeKind::NK_InlineCode), Code(Code) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_InlineCode;
+ }
+};
+
+/// Emphasized text: *text* or _text_.
+struct EmphasisNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Emphasis), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Emphasis;
+ }
};
-/// Parses Markdown from a single comment paragraph's text.
-/// Returns an empty ArrayRef if no Markdown constructs are found,
-/// so generators can fall back to plain-text rendering at zero cost.
-llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
- llvm::BumpPtrAllocator &Arena);
+/// Strongly emphasized text: **text** or __text__.
+struct StrongNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit StrongNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Strong), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Strong;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Block nodes
+//===----------------------------------------------------------------------===//
+
+/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// blank lines.
+struct ParagraphNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Paragraph), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Paragraph;
+ }
+};
+
+/// ATX heading: one to six leading # characters. Level is declared before
+/// Children to avoid padding between the base class's 4-byte Kind and the
+/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+struct HeadingNode : MDNode {
+ unsigned Level; // 1-6
+ llvm::ArrayRef<MDNode *> Children; // inline content
+ HeadingNode(unsigned Level, llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Heading), Level(Level), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Heading;
+ }
+};
+
+/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g.
+/// "cpp"); empty when no language was specified. Lines contains the raw text
+/// of each interior line, without the opening or closing fence.
+///
+/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// to 3 spaces; the closing fence must use the same character and be at least
+/// as long as the opening fence; only spaces may follow the closing fence.
+struct FencedCodeNode : MDNode {
+ llvm::StringRef Lang;
+ llvm::ArrayRef<llvm::StringRef> Lines;
+ FencedCodeNode(llvm::StringRef Lang, llvm::ArrayRef<llvm::StringRef> Lines)
+ : MDNode(NodeKind::NK_FencedCode), Lang(Lang), Lines(Lines) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_FencedCode;
+ }
+};
+
+/// Pipe table. Rows contains the raw text of each row line including the
+/// header and separator rows.
+/// TODO: replace with a structured header/body/cell representation.
+struct TableNode : MDNode {
+ llvm::ArrayRef<llvm::StringRef> Rows;
+ explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows)
+ : MDNode(NodeKind::NK_Table), Rows(Rows) {}
+ static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
+};
+
+/// A single list item. Children may contain block-level nodes for loose
+/// lists, or a single inline sequence for tight lists.
+struct ListItemNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_ListItem), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_ListItem;
+ }
+};
+
+/// Unordered (bullet) list. Markers are -, *, or +.
+struct UnorderedListNode : MDNode {
+ llvm::ArrayRef<ListItemNode *> Items;
+ explicit UnorderedListNode(llvm::ArrayRef<ListItemNode *> Items)
+ : MDNode(NodeKind::NK_UnorderedList), Items(Items) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_UnorderedList;
+ }
+};
+
+/// Ordered (numbered) list. Start is the number on the first item. Start is
+/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+struct OrderedListNode : MDNode {
+ unsigned Start;
+ llvm::ArrayRef<ListItemNode *> Items;
+ OrderedListNode(unsigned Start, llvm::ArrayRef<ListItemNode *> Items)
+ : MDNode(NodeKind::NK_OrderedList), Start(Start), Items(Items) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_OrderedList;
+ }
+};
+
+/// Block quote (> ...). Children are block-level nodes inside the quote.
+struct BlockQuoteNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit BlockQuoteNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_BlockQuote), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_BlockQuote;
+ }
+};
+
+/// Thematic break: a line of three or more ---, ***, or ___ characters.
+struct ThematicBreakNode : MDNode {
+ ThematicBreakNode() : MDNode(NodeKind::NK_ThematicBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_ThematicBreak;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Parser entry point
+//===----------------------------------------------------------------------===//
+
+/// Parse Markdown from a single paragraph of plain text. Returns a list of
+/// top-level block nodes allocated in Arena. Returns an empty ArrayRef if no
+/// Markdown constructs are found, letting callers fall back to plain-text
+/// rendering at zero cost. The parser never crashes on malformed input.
+///
+/// The caller must keep Arena alive for the lifetime of any returned nodes.
+llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
+ llvm::BumpPtrAllocator &Arena);
} // namespace clang::doc::markdown
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 4ca979c1f1d24..b61094f034375 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -8,9 +8,11 @@
#include "support/Markdown.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "gtest/gtest.h"
using namespace clang::doc::markdown;
+using namespace llvm;
namespace {
@@ -31,9 +33,8 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_Text);
- EXPECT_EQ(N.Content, "hello world");
+ auto *N = cast<TextNode>(Nodes[0]);
+ EXPECT_EQ(N->Text, "hello world");
}
TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -42,10 +43,9 @@ int x = 0;
````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "cpp");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
@@ -54,9 +54,8 @@ some code
```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Content.empty());
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lang.empty());
}
TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
@@ -74,7 +73,7 @@ TEST_F(MarkdownParserTest, PipeTable) {
| 1 | 2 |)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+ EXPECT_TRUE(isa<TableNode>(Nodes[0]));
}
TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
@@ -82,8 +81,8 @@ TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
c | d)",
Arena);
// No separator row so should not be parsed as a table.
- for (const auto &Node : Nodes)
- EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+ for (const auto *Node : Nodes)
+ EXPECT_FALSE(isa<TableNode>(Node));
}
TEST_F(MarkdownParserTest, UnorderedList) {
@@ -92,12 +91,11 @@ TEST_F(MarkdownParserTest, UnorderedList) {
- baz)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
- ASSERT_EQ(N.Children.size(), 3u);
- EXPECT_EQ(N.Children[0].Content, "foo");
- EXPECT_EQ(N.Children[1].Content, "bar");
- EXPECT_EQ(N.Children[2].Content, "baz");
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+ EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
+ EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
}
TEST_F(MarkdownParserTest, MixedContent) {
@@ -117,10 +115,9 @@ int x = 0;
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Content.empty());
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lang.empty());
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 120: tilde fence with a language tag.
@@ -130,10 +127,9 @@ int x = 0;
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "cpp");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
@@ -144,10 +140,9 @@ aaa
````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
// ~~~ is content, not a closing fence.
- ASSERT_EQ(N.Children.size(), 2u);
+ ASSERT_EQ(N->Lines.size(), 2u);
}
// CommonMark §4.5 example 130: a code block can be empty.
@@ -156,18 +151,16 @@ TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Children.empty());
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lines.empty());
}
// CommonMark §4.5 example 129: a code block may contain only blank lines.
TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
auto Nodes = parseMarkdown("```\n\n \n```", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- ASSERT_EQ(N.Children.size(), 2u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ ASSERT_EQ(N->Lines.size(), 2u);
}
// CommonMark §4.5 example 142: lang tag is captured from the info string.
@@ -179,10 +172,9 @@ end
``````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "ruby");
- ASSERT_EQ(N.Children.size(), 3u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "ruby");
+ ASSERT_EQ(N->Lines.size(), 3u);
}
// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
@@ -192,10 +184,9 @@ foo
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "aa ``` ~~~");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "aa ``` ~~~");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 124: closing fence must be at least as long as the
@@ -209,9 +200,8 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
// parser currently treats it as a closing fence. This test documents the
// current (non-conformant) behavior.
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ ASSERT_EQ(N->Lines.size(), 1u);
}
} // namespace
\ No newline at end of file
>From 25fe7daff183f51a8b31ed0d8481b9a2f1fbdbd8 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 02:33:44 -0400
Subject: [PATCH 05/23] [clang-doc] Introduce LineReader cursor for the
Markdown parse loop
Replace the raw size_t I = 0, E = Lines.size() index arithmetic in
parseMarkdown() with a LineReader cursor that encapsulates the position
and exposes peek(), peek(Offset), advance(), and atEnd(). The parse
logic and emitted nodes are unchanged; this only removes manual index
bookkeeping. All 18 MarkdownParserTest cases still pass.
Co-Authored-By: Claude Opus 4.8 <noreply at anthropic.com>
---
.../clang-doc/support/Markdown.cpp | 73 ++++++++++++++-----
1 file changed, 54 insertions(+), 19 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index bee15c3e23ec3..f171457e73046 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
+#include <cassert>
#define DEBUG_TYPE "clang-doc"
@@ -52,6 +53,42 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
+// A forward cursor over the lines of a paragraph. Encapsulates the parse
+// position so the loop can inspect the current or an upcoming line and consume
+// lines without manual index arithmetic. Lines are stored untrimmed; callers
+// trim where they need a normalized view.
+class LineReader {
+public:
+ explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+ // True once every line has been consumed.
+ bool atEnd() const { return Pos >= Lines.size(); }
+
+ // The current line, untrimmed. Must not be called when atEnd().
+ StringRef peek() const {
+ assert(!atEnd() && "peek past end of input");
+ return Lines[Pos];
+ }
+
+ // The line Offset positions ahead of the cursor, or an empty StringRef when
+ // that position is past the end. peek(0) is the current line.
+ StringRef peek(size_t Offset) const {
+ size_t Target = Pos + Offset;
+ return Target < Lines.size() ? Lines[Target] : StringRef();
+ }
+
+ // Consume the current line and return it, untrimmed. Must not be called when
+ // atEnd().
+ StringRef advance() {
+ assert(!atEnd() && "advance past end of input");
+ return Lines[Pos++];
+ }
+
+private:
+ ArrayRef<StringRef> Lines;
+ size_t Pos = 0;
+};
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -61,13 +98,13 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
ParagraphText.split(Lines, '\n');
SmallVector<MDNode *> Nodes;
- size_t I = 0, E = Lines.size();
+ LineReader Reader(Lines);
- while (I < E) {
- StringRef Line = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef Line = Reader.peek().trim();
if (Line.empty()) {
- ++I;
+ Reader.advance();
continue;
}
@@ -79,18 +116,18 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ Reader.advance(); // consume opening fence
SmallVector<StringRef> CodeLines;
- ++I;
- while (I < E) {
- StringRef CodeLine = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef CodeLine = Reader.peek().trim();
if (CodeLine.size() >= 3 &&
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(internString(Lines[I], Arena));
- ++I;
+ CodeLines.push_back(internString(Reader.advance(), Arena));
}
- ++I; // skip closing fence
+ if (!Reader.atEnd())
+ Reader.advance(); // consume closing fence
auto *Code =
new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
LDBG() << "emitting FencedCodeNode lang='" << Lang
@@ -100,12 +137,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
}
// Pipe table: current line has | and next line is a separator row.
- if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+ if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
SmallVector<StringRef> Rows;
- while (I < E && Lines[I].trim().contains('|')) {
- Rows.push_back(internString(Lines[I].trim(), Arena));
- ++I;
- }
+ while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+ Rows.push_back(internString(Reader.advance().trim(), Arena));
auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
@@ -115,8 +150,8 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Unordered list item.
if (isListItem(Line)) {
SmallVector<ListItemNode *> Items;
- while (I < E) {
- StringRef L = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
if (!isListItem(L))
break;
StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
@@ -125,7 +160,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
auto *Item =
new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
Items.push_back(Item);
- ++I;
+ Reader.advance();
}
auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
LDBG() << "emitting UnorderedListNode items=" << Items.size();
@@ -135,7 +170,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Plain text fallback.
Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
- ++I;
+ Reader.advance();
}
LDBG() << "parseMarkdown done nodes=" << Nodes.size();
>From 060bf63fe9f19fa45ef941f10594897351591d56 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 02:44:27 -0400
Subject: [PATCH 06/23] [clang-doc] Parse inline emphasis, strong, and code in
Markdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an inline pass over paragraph text that recognizes emphasis
(*text* or _text_), strong (**text** or __text__), and inline code
(`code`), emitting the EmphasisNode, StrongNode, and InlineCodeNode
types already in the hierarchy. Emphasis and strong recurse into their
content, and runs that match no construct stay plain TextNodes.
Delimiter matching uses a simplified subset of the CommonMark §6
flanking rules: a delimiter opens only with non-whitespace inside it and
closes only with non-whitespace before it, and code spans close on a
backtick run of equal length. The full delimiter-stack model is left as
a TODO. Adds 12 unit tests covering each construct plus the unmatched
and unterminated cases.
Co-Authored-By: Claude Opus 4.8 <noreply at anthropic.com>
---
.../clang-doc/support/Markdown.cpp | 121 +++++++++++++++++-
.../clang-doc/MarkdownParserTest.cpp | 97 ++++++++++++++
2 files changed, 216 insertions(+), 2 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f171457e73046..f1af4f5430772 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -8,6 +8,7 @@
#include "Markdown.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
@@ -89,6 +90,121 @@ class LineReader {
size_t Pos = 0;
};
+// Returns the number of consecutive copies of C starting at S[Start].
+static size_t countRun(StringRef S, size_t Start, char C) {
+ size_t I = Start;
+ while (I < S.size() && S[I] == C)
+ ++I;
+ return I - Start;
+}
+
+// Strips one leading and one trailing space from a code span's content when
+// both are present and the content is not all spaces, per CommonMark §6.1.
+static StringRef trimCodeSpan(StringRef Code) {
+ if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' &&
+ Code.find_first_not_of(' ') != StringRef::npos)
+ return Code.drop_front().drop_back();
+ return Code;
+}
+
+// Finds the start index of a closing emphasis run of exactly Count copies of C,
+// searching forward from From. Requires non-whitespace immediately inside both
+// the opening and closing delimiters and non-empty content, a simplified take
+// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
+// closing run exists.
+static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+ size_t E = S.size();
+ // Opening delimiter is not left-flanking if whitespace follows it.
+ if (From >= E || isSpace(S[From]))
+ return StringRef::npos;
+ for (size_t J = From; J + Count <= E; ++J) {
+ if (S[J] != C)
+ continue;
+ size_t Run = countRun(S, J, C);
+ if (Run != Count) {
+ J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
+ continue;
+ }
+ // Reject empty content and closing runs that are not right-flanking.
+ if (J == From || isSpace(S[J - 1]))
+ continue;
+ return J;
+ }
+ return StringRef::npos;
+}
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
+// _text_). Runs that match no construct become TextNodes. Emphasis and strong
+// recurse so their content may itself contain inline constructs. Text with no
+// markers yields a single TextNode.
+//
+// TODO: This covers the common cases but not the full CommonMark §6 inline
+// model (delimiter stacks, intraword underscore rules, links, autolinks).
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+ SmallVector<MDNode *> Nodes;
+ size_t TextStart = 0, I = 0, E = S.size();
+
+ auto flushText = [&](size_t End) {
+ if (End > TextStart)
+ Nodes.push_back(new (Arena) TextNode(
+ internString(S.substr(TextStart, End - TextStart), Arena)));
+ };
+
+ while (I < E) {
+ char C = S[I];
+
+ // Inline code span: a run of N backticks closed by a run of N backticks.
+ if (C == '`') {
+ size_t N = countRun(S, I, '`');
+ size_t J = I + N;
+ while (J < E && countRun(S, J, '`') != N)
+ J += S[J] == '`' ? countRun(S, J, '`') : 1;
+ if (J < E) {
+ flushText(I);
+ StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+ Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+ I = J + N;
+ TextStart = I;
+ continue;
+ }
+ // No closing run; leave the backticks as literal text.
+ I += N;
+ continue;
+ }
+
+ // Emphasis (*text*, _text_) and strong (**text**, __text__).
+ if (C == '*' || C == '_') {
+ // Strong binds the two-delimiter form before single-delimiter emphasis.
+ if (I + 1 < E && S[I + 1] == C) {
+ size_t Close = findClosingDelim(S, I + 2, C, 2);
+ if (Close != StringRef::npos) {
+ flushText(I);
+ StringRef Inner = S.substr(I + 2, Close - (I + 2));
+ Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
+ I = Close + 2;
+ TextStart = I;
+ continue;
+ }
+ }
+ size_t Close = findClosingDelim(S, I + 1, C, 1);
+ if (Close != StringRef::npos) {
+ flushText(I);
+ StringRef Inner = S.substr(I + 1, Close - (I + 1));
+ Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
+ I = Close + 1;
+ TextStart = I;
+ continue;
+ }
+ }
+
+ ++I;
+ }
+
+ flushText(E);
+ return allocateArray(Nodes, Arena);
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -168,8 +284,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
- // Plain text fallback.
- Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
+ // Plain text, scanned for inline constructs (emphasis, strong, code).
+ for (MDNode *Inline : parseInline(Line, Arena))
+ Nodes.push_back(Inline);
Reader.advance();
}
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index b61094f034375..ea72dacfb08e5 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -204,4 +204,101 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
ASSERT_EQ(N->Lines.size(), 1u);
}
+TEST_F(MarkdownParserTest, EmphasisAsterisk) {
+ auto Nodes = parseMarkdown("an *important* word", Arena);
+ ASSERT_EQ(Nodes.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(Nodes[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word");
+}
+
+TEST_F(MarkdownParserTest, EmphasisUnderscore) {
+ auto Nodes = parseMarkdown("_em_", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Em = cast<EmphasisNode>(Nodes[0]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em");
+}
+
+TEST_F(MarkdownParserTest, StrongAsterisk) {
+ auto Nodes = parseMarkdown("**bold**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *St = cast<StrongNode>(Nodes[0]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+TEST_F(MarkdownParserTest, StrongUnderscore) {
+ auto Nodes = parseMarkdown("__bold__", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *St = cast<StrongNode>(Nodes[0]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+// Two delimiters must be parsed as strong, not as nested emphasis.
+TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) {
+ auto Nodes = parseMarkdown("**strong**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<StrongNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, InlineCode) {
+ auto Nodes = parseMarkdown("call `foo()` here", Arena);
+ ASSERT_EQ(Nodes.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call ");
+ EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()");
+ EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here");
+}
+
+// CommonMark §6.1: a doubled backtick fence lets the span contain a single
+// backtick.
+TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) {
+ auto Nodes = parseMarkdown("``a`b``", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b");
+}
+
+// Emphasis and strong recurse, so a code span inside emphasis is parsed.
+TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
+ auto Nodes = parseMarkdown("*see `x`*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Em = cast<EmphasisNode>(Nodes[0]);
+ ASSERT_EQ(Em->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see ");
+ EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x");
+}
+
+TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
+ auto Nodes = parseMarkdown("**a `b`**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *St = cast<StrongNode>(Nodes[0]);
+ ASSERT_EQ(St->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a ");
+ EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b");
+}
+
+// A delimiter with whitespace on the inside does not open emphasis.
+TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) {
+ auto Nodes = parseMarkdown("a * b", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b");
+}
+
+// An unterminated code span leaves the backtick as literal text.
+TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) {
+ auto Nodes = parseMarkdown("a `b c", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c");
+}
+
+// Inline parsing must not disturb plain text with no markers.
+TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
+ auto Nodes = parseMarkdown("just words", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
+}
+
} // namespace
\ No newline at end of file
>From 0af1c8e2999a20e2044cc337a8c4f0d8112d208b Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 10:18:32 -0400
Subject: [PATCH 07/23] [clang-doc] Address review feedback: rename inline
parser variables, simplify header docs
---
.../clang-doc/support/Markdown.cpp | 54 ++++++++++---------
.../clang-doc/support/Markdown.h | 25 +++------
2 files changed, 34 insertions(+), 45 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f1af4f5430772..ef29daa76a166 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -143,7 +143,7 @@ static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
// model (delimiter stacks, intraword underscore rules, links, autolinks).
static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
SmallVector<MDNode *> Nodes;
- size_t TextStart = 0, I = 0, E = S.size();
+ size_t TextStart = 0, Pos = 0, E = S.size();
auto flushText = [&](size_t End) {
if (End > TextStart)
@@ -151,54 +151,56 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
internString(S.substr(TextStart, End - TextStart), Arena)));
};
- while (I < E) {
- char C = S[I];
+ while (Pos < E) {
+ char C = S[Pos];
- // Inline code span: a run of N backticks closed by a run of N backticks.
+ // Inline code span: an opening backtick run closed by a run of the same
+ // length.
if (C == '`') {
- size_t N = countRun(S, I, '`');
- size_t J = I + N;
- while (J < E && countRun(S, J, '`') != N)
- J += S[J] == '`' ? countRun(S, J, '`') : 1;
- if (J < E) {
- flushText(I);
- StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+ size_t OpenLen = countRun(S, Pos, '`');
+ size_t ClosePos = Pos + OpenLen;
+ while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen)
+ ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
+ if (ClosePos < E) {
+ flushText(Pos);
+ StringRef Code =
+ trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
- I = J + N;
- TextStart = I;
+ Pos = ClosePos + OpenLen;
+ TextStart = Pos;
continue;
}
// No closing run; leave the backticks as literal text.
- I += N;
+ Pos += OpenLen;
continue;
}
// Emphasis (*text*, _text_) and strong (**text**, __text__).
if (C == '*' || C == '_') {
// Strong binds the two-delimiter form before single-delimiter emphasis.
- if (I + 1 < E && S[I + 1] == C) {
- size_t Close = findClosingDelim(S, I + 2, C, 2);
+ if (Pos + 1 < E && S[Pos + 1] == C) {
+ size_t Close = findClosingDelim(S, Pos + 2, C, 2);
if (Close != StringRef::npos) {
- flushText(I);
- StringRef Inner = S.substr(I + 2, Close - (I + 2));
+ flushText(Pos);
+ StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
- I = Close + 2;
- TextStart = I;
+ Pos = Close + 2;
+ TextStart = Pos;
continue;
}
}
- size_t Close = findClosingDelim(S, I + 1, C, 1);
+ size_t Close = findClosingDelim(S, Pos + 1, C, 1);
if (Close != StringRef::npos) {
- flushText(I);
- StringRef Inner = S.substr(I + 1, Close - (I + 1));
+ flushText(Pos);
+ StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
- I = Close + 1;
- TextStart = I;
+ Pos = Close + 1;
+ TextStart = Pos;
continue;
}
}
- ++I;
+ ++Pos;
}
flushText(E);
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 3d457bcddfac6..60390465588c3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -9,20 +9,10 @@
/// \file
/// Standalone Markdown parsing library for the LLVM ecosystem.
///
-/// The parser takes plain paragraph text and returns a polymorphic tree of
-/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
-/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
-/// type carries exactly the fields it needs -- no overloaded Content field,
-/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
-/// downcasting; each concrete type provides classof() for this purpose.
-///
-/// See
-/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
-///
-/// Field ordering in each derived struct is chosen to minimize padding:
-/// 4-byte fields (like Level or Start) are declared before 16-byte fields
-/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
-/// base class's 4-byte Kind and the first derived field.
+/// The parser takes a single paragraph of plain text and returns a list of
+/// nodes describing the Markdown it found. Each kind of construct has its own
+/// node type, and every node shares a common MDNode base, so you can use
+/// llvm::isa<>/cast<>/dyn_cast<> to check what a node is.
///
/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
/// TextNode -- plain text run
@@ -165,9 +155,7 @@ struct ParagraphNode : MDNode {
}
};
-/// ATX heading: one to six leading # characters. Level is declared before
-/// Children to avoid padding between the base class's 4-byte Kind and the
-/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+/// ATX heading: one to six leading # characters.
struct HeadingNode : MDNode {
unsigned Level; // 1-6
llvm::ArrayRef<MDNode *> Children; // inline content
@@ -226,8 +214,7 @@ struct UnorderedListNode : MDNode {
}
};
-/// Ordered (numbered) list. Start is the number on the first item. Start is
-/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+/// Ordered (numbered) list. Start is the number on the first item.
struct OrderedListNode : MDNode {
unsigned Start;
llvm::ArrayRef<ListItemNode *> Items;
>From b76bfa182db40e7a358ffb7d42506aff24453e14 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 11:50:32 -0400
Subject: [PATCH 08/23] [clang-doc] Add libFuzzer harness for parseMarkdown()
---
clang-tools-extra/clang-doc/CMakeLists.txt | 1 +
.../clang-doc/fuzzer/CMakeLists.txt | 21 +++++++++++++
.../clang-doc/fuzzer/DummyMarkdownFuzzer.cpp | 21 +++++++++++++
.../clang-doc/fuzzer/FuzzMarkdown.cpp | 30 +++++++++++++++++++
4 files changed, 73 insertions(+)
create mode 100644 clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
create mode 100644 clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
create mode 100644 clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt
index 22e2c8159e9f6..f64d1129ed4af 100644
--- a/clang-tools-extra/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/CMakeLists.txt
@@ -44,6 +44,7 @@ target_link_libraries(clangDoc
)
add_subdirectory(tool)
+add_subdirectory(fuzzer)
if (LLVM_INCLUDE_BENCHMARKS)
add_subdirectory(benchmarks)
diff --git a/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
new file mode 100644
index 0000000000000..5e6e943891052
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Resolve "support/Markdown.h" against the parent clang-doc directory.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(LLVM_LINK_COMPONENTS
+ FuzzerCLI
+ Support
+ )
+
+# This fuzzer runs on oss-fuzz, so keep it around even if it looks unreferenced.
+# With a fuzzing engine configured (LLVM_USE_SANITIZE_COVERAGE or an external
+# LLVM_LIB_FUZZING_ENGINE) this builds a real fuzz target; otherwise DUMMY_MAIN
+# provides a main() so it still builds and can be replayed over saved inputs.
+add_llvm_fuzzer(clang-doc-markdown-fuzzer
+ FuzzMarkdown.cpp
+ DUMMY_MAIN DummyMarkdownFuzzer.cpp
+ )
+
+target_link_libraries(clang-doc-markdown-fuzzer
+ PRIVATE
+ clangDocSupport
+ )
diff --git a/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
new file mode 100644
index 0000000000000..61466e0fa4ef6
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
@@ -0,0 +1,21 @@
+//===-- DummyMarkdownFuzzer.cpp - Entry point to test the fuzzer ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of main so we can build and test the harness without linking
+// libFuzzer. Each command line argument is treated as a file to run the
+// harness on.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/FuzzMutate/FuzzerCLI.h"
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+int main(int argc, char *argv[]) {
+ return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput);
+}
diff --git a/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
new file mode 100644
index 0000000000000..e407b3baccf2e
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
@@ -0,0 +1,30 @@
+//===-- FuzzMarkdown.cpp - Fuzzer for the clang-doc Markdown parser -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a libFuzzer harness for parseMarkdown(). It feeds
+/// arbitrary bytes to the parser and checks that it never crashes. The parsed
+/// nodes are walked so the returned tree is exercised, not just allocated.
+///
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstddef>
+#include <cstdint>
+
+using namespace clang::doc::markdown;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+ llvm::BumpPtrAllocator Arena;
+ llvm::StringRef Input(reinterpret_cast<const char *>(Data), Size);
+ for (const MDNode *Node : parseMarkdown(Input, Arena))
+ (void)Node->Kind;
+ return 0;
+}
>From 77e28993d7a167410fd1a1ee97d2824945b44063 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 18:47:47 -0400
Subject: [PATCH 09/23] [clang-doc] Address review feedback: rename
findClosingDelim params, add table TODO, fix EOF newline
---
.../clang-doc/support/Markdown.cpp | 28 +++++++++++--------
.../clang-doc/MarkdownParserTest.cpp | 2 +-
2 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index ef29daa76a166..6a57cd7900ea2 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -107,26 +107,27 @@ static StringRef trimCodeSpan(StringRef Code) {
return Code;
}
-// Finds the start index of a closing emphasis run of exactly Count copies of C,
-// searching forward from From. Requires non-whitespace immediately inside both
-// the opening and closing delimiters and non-empty content, a simplified take
-// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
-// closing run exists.
-static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+// Finds the start index of a closing emphasis run of exactly DelimLen copies of
+// DelimChar, searching forward from StartPos. Requires non-whitespace
+// immediately inside both the opening and closing delimiters and non-empty
+// content, a simplified take on the CommonMark §6.2 flanking rules. Returns
+// StringRef::npos if no valid closing run exists.
+static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
+ size_t DelimLen) {
size_t E = S.size();
// Opening delimiter is not left-flanking if whitespace follows it.
- if (From >= E || isSpace(S[From]))
+ if (StartPos >= E || isSpace(S[StartPos]))
return StringRef::npos;
- for (size_t J = From; J + Count <= E; ++J) {
- if (S[J] != C)
+ for (size_t J = StartPos; J + DelimLen <= E; ++J) {
+ if (S[J] != DelimChar)
continue;
- size_t Run = countRun(S, J, C);
- if (Run != Count) {
+ size_t Run = countRun(S, J, DelimChar);
+ if (Run != DelimLen) {
J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
continue;
}
// Reject empty content and closing runs that are not right-flanking.
- if (J == From || isSpace(S[J - 1]))
+ if (J == StartPos || isSpace(S[J - 1]))
continue;
return J;
}
@@ -257,6 +258,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
SmallVector<StringRef> Rows;
+ // TODO: Rows are kept as raw line text for now. Table cells may contain
+ // inline content (emphasis, code spans, links), so each row may need to
+ // be split on '|' and parsed further into structured cells.
while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
Rows.push_back(internString(Reader.advance().trim(), Arena));
auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ea72dacfb08e5..28bb9d567e6bc 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -301,4 +301,4 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
}
-} // namespace
\ No newline at end of file
+} // namespace
>From f33ef2ce3f9292e10f1e1dd220a500070ef21bc5 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:29:15 -0400
Subject: [PATCH 10/23] [clang-doc] Address review feedback: make
UnterminatedFence and MixedContent tests explicit
---
.../unittests/clang-doc/MarkdownParserTest.cpp | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 28bb9d567e6bc..207ae938c299a 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -58,13 +58,17 @@ some code
EXPECT_TRUE(N->Lang.empty());
}
-TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+TEST_F(MarkdownParserTest, UnterminatedFenceProducesCodeNode) {
auto Nodes = parseMarkdown(R"(```cpp
int x = 0;)",
Arena);
- // Unterminated fence should not crash and should produce a code node
- // with whatever lines were found.
- EXPECT_FALSE(Nodes.empty());
+ // An unterminated fence should not crash. The parser falls back to emitting a
+ // FencedCodeNode with whatever lines were found before the end of input.
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
+ EXPECT_EQ(N->Lines[0], "int x = 0;");
}
TEST_F(MarkdownParserTest, PipeTable) {
@@ -105,7 +109,10 @@ code
````````
- item)",
Arena);
- EXPECT_EQ(Nodes.size(), 3u);
+ ASSERT_EQ(Nodes.size(), 3u);
+ EXPECT_TRUE(isa<TextNode>(Nodes[0]));
+ EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1]));
+ EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2]));
}
// CommonMark §4.5 example 120: tilde fences work the same as backtick fences.
>From 4371be42e6ccb7a955301c77b5b732e45675347d Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:35:54 -0400
Subject: [PATCH 11/23] [clang-doc] Replace internString with
llvm::StringSaver, matching Mustache pattern
---
.../clang-doc/support/Markdown.cpp | 34 ++++++++-----------
1 file changed, 15 insertions(+), 19 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 6a57cd7900ea2..be2800bff5df7 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/StringSaver.h"
#include <cassert>
#define DEBUG_TYPE "clang-doc"
@@ -31,15 +32,6 @@ static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
return ArrayRef<T>(Allocated, Vec.size());
}
-// Interns a StringRef into the arena so it outlives the parse loop.
-static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
- if (S.empty())
- return {};
- char *Buf = Arena.Allocate<char>(S.size());
- std::copy(S.begin(), S.end(), Buf);
- return StringRef(Buf, S.size());
-}
-
// A line is a table separator if it only contains |, -, :, and spaces,
// and has at least one -.
static bool isSepRow(StringRef Line) {
@@ -142,14 +134,15 @@ static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
//
// TODO: This covers the common cases but not the full CommonMark §6 inline
// model (delimiter stacks, intraword underscore rules, links, autolinks).
-static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
SmallVector<MDNode *> Nodes;
size_t TextStart = 0, Pos = 0, E = S.size();
auto flushText = [&](size_t End) {
if (End > TextStart)
Nodes.push_back(new (Arena) TextNode(
- internString(S.substr(TextStart, End - TextStart), Arena)));
+ Saver.save(S.substr(TextStart, End - TextStart))));
};
while (Pos < E) {
@@ -166,7 +159,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
flushText(Pos);
StringRef Code =
trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
- Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+ Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
Pos = ClosePos + OpenLen;
TextStart = Pos;
continue;
@@ -184,7 +177,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
if (Close != StringRef::npos) {
flushText(Pos);
StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
- Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
+ Nodes.push_back(new (Arena)
+ StrongNode(parseInline(Inner, Arena, Saver)));
Pos = Close + 2;
TextStart = Pos;
continue;
@@ -194,7 +188,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
if (Close != StringRef::npos) {
flushText(Pos);
StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
- Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
+ Nodes.push_back(new (Arena)
+ EmphasisNode(parseInline(Inner, Arena, Saver)));
Pos = Close + 1;
TextStart = Pos;
continue;
@@ -213,6 +208,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
if (ParagraphText.trim().empty())
return {};
+ StringSaver Saver(Arena);
SmallVector<StringRef, 16> Lines;
ParagraphText.split(Lines, '\n');
@@ -234,7 +230,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// case-by-case basis.
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
- StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ StringRef Lang = Saver.save(Line.drop_front(3).trim());
Reader.advance(); // consume opening fence
SmallVector<StringRef> CodeLines;
while (!Reader.atEnd()) {
@@ -243,7 +239,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(internString(Reader.advance(), Arena));
+ CodeLines.push_back(Saver.save(Reader.advance()));
}
if (!Reader.atEnd())
Reader.advance(); // consume closing fence
@@ -262,7 +258,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// inline content (emphasis, code spans, links), so each row may need to
// be split on '|' and parsed further into structured cells.
while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
- Rows.push_back(internString(Reader.advance().trim(), Arena));
+ Rows.push_back(Saver.save(Reader.advance().trim()));
auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
@@ -276,7 +272,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
StringRef L = Reader.peek().trim();
if (!isListItem(L))
break;
- StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+ StringRef ItemText = Saver.save(L.drop_front(2).trim());
SmallVector<MDNode *> ItemChildren;
ItemChildren.push_back(new (Arena) TextNode(ItemText));
auto *Item =
@@ -291,7 +287,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
}
// Plain text, scanned for inline constructs (emphasis, strong, code).
- for (MDNode *Inline : parseInline(Line, Arena))
+ for (MDNode *Inline : parseInline(Line, Arena, Saver))
Nodes.push_back(Inline);
Reader.advance();
}
>From 0b5f53715fc6e78a56145609893fa61f5cf4f353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:59:09 -0400
Subject: [PATCH 12/23] [clang-doc] Address review feedback: fix comment
accuracy and trim AI-sounding language
---
clang-tools-extra/clang-doc/support/Markdown.h | 12 +++++-------
.../unittests/clang-doc/MarkdownParserTest.cpp | 13 +++++--------
2 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 60390465588c3..8c2055868671a 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -49,8 +49,7 @@
namespace clang::doc::markdown {
/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
-/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
-/// cheap range-based checks in classof() implementations.
+/// block kinds.
enum class NodeKind {
// Inline nodes
NK_Text,
@@ -193,8 +192,7 @@ struct TableNode : MDNode {
static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
};
-/// A single list item. Children may contain block-level nodes for loose
-/// lists, or a single inline sequence for tight lists.
+/// A single list item. Children holds the item's inline content.
struct ListItemNode : MDNode {
llvm::ArrayRef<MDNode *> Children;
explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
@@ -248,9 +246,9 @@ struct ThematicBreakNode : MDNode {
//===----------------------------------------------------------------------===//
/// Parse Markdown from a single paragraph of plain text. Returns a list of
-/// top-level block nodes allocated in Arena. Returns an empty ArrayRef if no
-/// Markdown constructs are found, letting callers fall back to plain-text
-/// rendering at zero cost. The parser never crashes on malformed input.
+/// top-level nodes allocated in Arena. Returns an empty ArrayRef only for empty
+/// or whitespace-only input; plain text with no Markdown constructs returns a
+/// single TextNode.
///
/// The caller must keep Arena alive for the lifetime of any returned nodes.
llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 207ae938c299a..e2fd07159d446 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -196,16 +196,13 @@ foo
ASSERT_EQ(N->Lines.size(), 1u);
}
-// CommonMark §4.5 example 124: closing fence must be at least as long as the
-// opening fence.
-// TODO: our parser currently closes on the first line with 3 matching fence
-// chars regardless of opening fence length. Fix as part of the CommonMark
-// TODO in parseMarkdown().
+// CommonMark §4.5 example 124: the closing fence must be at least as long as
+// the opening fence. Our parser closes on the first line with 3 matching fence
+// chars regardless of opening length, so this documents the current
+// non-conformant behavior.
+// TODO: fix as part of the CommonMark TODO in parseMarkdown().
TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
auto Nodes = parseMarkdown("````\naaa\n```", Arena);
- // The ``` line should not close the ```` fence per CommonMark, but our
- // parser currently treats it as a closing fence. This test documents the
- // current (non-conformant) behavior.
ASSERT_EQ(Nodes.size(), 1u);
auto *N = cast<FencedCodeNode>(Nodes[0]);
ASSERT_EQ(N->Lines.size(), 1u);
>From 14f455ecde0305ec38e20ca6068b0d8f5f259776 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:07:02 -0400
Subject: [PATCH 13/23] [clang-doc] Wrap plain-text paragraph lines in
ParagraphNode
---
.../clang-doc/support/Markdown.cpp | 7 +-
.../clang-doc/MarkdownParserTest.cpp | 67 +++++++++++++------
2 files changed, 50 insertions(+), 24 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index be2800bff5df7..59e651d2b8b05 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -286,9 +286,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
- // Plain text, scanned for inline constructs (emphasis, strong, code).
- for (MDNode *Inline : parseInline(Line, Arena, Saver))
- Nodes.push_back(Inline);
+ // Plain text line: scan for inline constructs (emphasis, strong, code) and
+ // wrap the result in a paragraph.
+ auto Inlines = parseInline(Line, Arena, Saver);
+ Nodes.push_back(new (Arena) ParagraphNode(Inlines));
Reader.advance();
}
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index e2fd07159d446..63d978061b99b 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -33,8 +33,9 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *N = cast<TextNode>(Nodes[0]);
- EXPECT_EQ(N->Text, "hello world");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "hello world");
}
TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -110,7 +111,7 @@ code
- item)",
Arena);
ASSERT_EQ(Nodes.size(), 3u);
- EXPECT_TRUE(isa<TextNode>(Nodes[0]));
+ EXPECT_TRUE(isa<ParagraphNode>(Nodes[0]));
EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1]));
EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2]));
}
@@ -210,18 +211,22 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
TEST_F(MarkdownParserTest, EmphasisAsterisk) {
auto Nodes = parseMarkdown("an *important* word", Arena);
- ASSERT_EQ(Nodes.size(), 3u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an ");
- auto *Em = cast<EmphasisNode>(Nodes[1]);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
ASSERT_EQ(Em->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
- EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word");
+ EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " word");
}
TEST_F(MarkdownParserTest, EmphasisUnderscore) {
auto Nodes = parseMarkdown("_em_", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *Em = cast<EmphasisNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *Em = cast<EmphasisNode>(P->Children[0]);
ASSERT_EQ(Em->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em");
}
@@ -229,7 +234,9 @@ TEST_F(MarkdownParserTest, EmphasisUnderscore) {
TEST_F(MarkdownParserTest, StrongAsterisk) {
auto Nodes = parseMarkdown("**bold**", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *St = cast<StrongNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *St = cast<StrongNode>(P->Children[0]);
ASSERT_EQ(St->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
}
@@ -237,7 +244,9 @@ TEST_F(MarkdownParserTest, StrongAsterisk) {
TEST_F(MarkdownParserTest, StrongUnderscore) {
auto Nodes = parseMarkdown("__bold__", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *St = cast<StrongNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *St = cast<StrongNode>(P->Children[0]);
ASSERT_EQ(St->Children.size(), 1u);
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
}
@@ -246,15 +255,19 @@ TEST_F(MarkdownParserTest, StrongUnderscore) {
TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) {
auto Nodes = parseMarkdown("**strong**", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_TRUE(isa<StrongNode>(Nodes[0]));
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_TRUE(isa<StrongNode>(P->Children[0]));
}
TEST_F(MarkdownParserTest, InlineCode) {
auto Nodes = parseMarkdown("call `foo()` here", Arena);
- ASSERT_EQ(Nodes.size(), 3u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call ");
- EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()");
- EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here");
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "call ");
+ EXPECT_EQ(cast<InlineCodeNode>(P->Children[1])->Code, "foo()");
+ EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " here");
}
// CommonMark §6.1: a doubled backtick fence lets the span contain a single
@@ -262,14 +275,18 @@ TEST_F(MarkdownParserTest, InlineCode) {
TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) {
auto Nodes = parseMarkdown("``a`b``", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "a`b");
}
// Emphasis and strong recurse, so a code span inside emphasis is parsed.
TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
auto Nodes = parseMarkdown("*see `x`*", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *Em = cast<EmphasisNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *Em = cast<EmphasisNode>(P->Children[0]);
ASSERT_EQ(Em->Children.size(), 2u);
EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see ");
EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x");
@@ -278,7 +295,9 @@ TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
auto Nodes = parseMarkdown("**a `b`**", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- auto *St = cast<StrongNode>(Nodes[0]);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ auto *St = cast<StrongNode>(P->Children[0]);
ASSERT_EQ(St->Children.size(), 2u);
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a ");
EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b");
@@ -288,21 +307,27 @@ TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) {
auto Nodes = parseMarkdown("a * b", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a * b");
}
// An unterminated code span leaves the backtick as literal text.
TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) {
auto Nodes = parseMarkdown("a `b c", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a `b c");
}
// Inline parsing must not disturb plain text with no markers.
TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
auto Nodes = parseMarkdown("just words", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "just words");
}
} // namespace
>From 7bb303ad25d10ba9540af4ee38f3aac0582d49df Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:18:11 -0400
Subject: [PATCH 14/23] [clang-doc] Add CharReader cursor for character-level
inline scanning
---
.../clang-doc/support/Markdown.cpp | 75 +++++++++++++++----
1 file changed, 60 insertions(+), 15 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 59e651d2b8b05..1eb6ad51eaf02 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -82,6 +82,49 @@ class LineReader {
size_t Pos = 0;
};
+// A forward cursor over the characters of a string. The character-level analog
+// of LineReader: the inline scanner inspects the current or an upcoming
+// character and consumes characters without manual index arithmetic. position()
+// and seek() let it interoperate with the index-based run and delimiter helpers
+// below, since inline constructs are not consumed one character at a time.
+class CharReader {
+public:
+ explicit CharReader(StringRef S) : S(S) {}
+
+ // True once every character has been consumed.
+ bool atEnd() const { return Pos >= S.size(); }
+
+ // The current character. Must not be called when atEnd().
+ char peek() const {
+ assert(!atEnd() && "peek past end of input");
+ return S[Pos];
+ }
+
+ // The character Offset positions ahead of the cursor, or '\0' when that
+ // position is past the end. peek(0) is the current character.
+ char peek(size_t Offset) const {
+ size_t Target = Pos + Offset;
+ return Target < S.size() ? S[Target] : '\0';
+ }
+
+ // Consume the current character and return it. Must not be called when
+ // atEnd().
+ char advance() {
+ assert(!atEnd() && "advance past end of input");
+ return S[Pos++];
+ }
+
+ // The current scan position, for substring, run, and delimiter computations.
+ size_t position() const { return Pos; }
+
+ // Move the cursor to an absolute position, used to skip past a matched span.
+ void seek(size_t NewPos) { Pos = NewPos; }
+
+private:
+ StringRef S;
+ size_t Pos = 0;
+};
+
// Returns the number of consecutive copies of C starting at S[Start].
static size_t countRun(StringRef S, size_t Start, char C) {
size_t I = Start;
@@ -137,7 +180,8 @@ static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
StringSaver &Saver) {
SmallVector<MDNode *> Nodes;
- size_t TextStart = 0, Pos = 0, E = S.size();
+ CharReader Reader(S);
+ size_t TextStart = 0;
auto flushText = [&](size_t End) {
if (End > TextStart)
@@ -145,42 +189,43 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
Saver.save(S.substr(TextStart, End - TextStart))));
};
- while (Pos < E) {
- char C = S[Pos];
+ while (!Reader.atEnd()) {
+ size_t Pos = Reader.position();
+ char C = Reader.peek();
// Inline code span: an opening backtick run closed by a run of the same
// length.
if (C == '`') {
size_t OpenLen = countRun(S, Pos, '`');
size_t ClosePos = Pos + OpenLen;
- while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen)
+ while (ClosePos < S.size() && countRun(S, ClosePos, '`') != OpenLen)
ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
- if (ClosePos < E) {
+ if (ClosePos < S.size()) {
flushText(Pos);
StringRef Code =
trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
- Pos = ClosePos + OpenLen;
- TextStart = Pos;
+ Reader.seek(ClosePos + OpenLen);
+ TextStart = Reader.position();
continue;
}
// No closing run; leave the backticks as literal text.
- Pos += OpenLen;
+ Reader.seek(Pos + OpenLen);
continue;
}
// Emphasis (*text*, _text_) and strong (**text**, __text__).
if (C == '*' || C == '_') {
// Strong binds the two-delimiter form before single-delimiter emphasis.
- if (Pos + 1 < E && S[Pos + 1] == C) {
+ if (Reader.peek(1) == C) {
size_t Close = findClosingDelim(S, Pos + 2, C, 2);
if (Close != StringRef::npos) {
flushText(Pos);
StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
Nodes.push_back(new (Arena)
StrongNode(parseInline(Inner, Arena, Saver)));
- Pos = Close + 2;
- TextStart = Pos;
+ Reader.seek(Close + 2);
+ TextStart = Reader.position();
continue;
}
}
@@ -190,16 +235,16 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
Nodes.push_back(new (Arena)
EmphasisNode(parseInline(Inner, Arena, Saver)));
- Pos = Close + 1;
- TextStart = Pos;
+ Reader.seek(Close + 1);
+ TextStart = Reader.position();
continue;
}
}
- ++Pos;
+ Reader.advance();
}
- flushText(E);
+ flushText(S.size());
return allocateArray(Nodes, Arena);
}
>From 6864c7b552b37e64ee69c4660517da2cf2c22975 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:22:06 -0400
Subject: [PATCH 15/23] [clang-doc] Extract block parse bodies into separate
functions
---
.../clang-doc/support/Markdown.cpp | 122 +++++++++++-------
1 file changed, 73 insertions(+), 49 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 1eb6ad51eaf02..625b3e6305ab9 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -248,6 +248,75 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
return allocateArray(Nodes, Arena);
}
+// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
+// opening fence; the fence, body lines, and closing fence are consumed.
+//
+// TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+// indented up to 3 spaces, the closing fence must use the same character and be
+// at least as long as the opening fence, and the closing fence may only be
+// followed by spaces. Doxygen specifics should be handled on a case-by-case
+// basis.
+static FencedCodeNode *parseFencedCode(LineReader &Reader,
+ BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ StringRef Open = Reader.peek().trim();
+ char Fence = Open[0];
+ StringRef Lang = Saver.save(Open.drop_front(3).trim());
+ Reader.advance(); // consume opening fence
+ SmallVector<StringRef> CodeLines;
+ while (!Reader.atEnd()) {
+ StringRef CodeLine = Reader.peek().trim();
+ if (CodeLine.size() >= 3 &&
+ all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; }))
+ break;
+ CodeLines.push_back(Saver.save(Reader.advance()));
+ }
+ if (!Reader.atEnd())
+ Reader.advance(); // consume closing fence
+ auto *Code =
+ new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+ LDBG() << "emitting FencedCodeNode lang='" << Lang
+ << "' lines=" << CodeLines.size();
+ return Code;
+}
+
+// Parses a pipe table. The cursor must be on the header row, with a separator
+// row following; consecutive lines containing a | are taken as rows.
+static TableNode *parsePipeTable(LineReader &Reader, BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ SmallVector<StringRef> Rows;
+ // TODO: Rows are kept as raw line text for now. Table cells may contain
+ // inline content (emphasis, code spans, links), so each row may need to be
+ // split on '|' and parsed further into structured cells.
+ while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+ Rows.push_back(Saver.save(Reader.advance().trim()));
+ auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+ LDBG() << "emitting TableNode rows=" << Rows.size();
+ return Table;
+}
+
+// Parses an unordered (bullet) list. The cursor must be on the first item;
+// consecutive bullet lines are consumed into list items.
+static UnorderedListNode *parseUnorderedList(LineReader &Reader,
+ BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ SmallVector<ListItemNode *> Items;
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
+ if (!isListItem(L))
+ break;
+ StringRef ItemText = Saver.save(L.drop_front(2).trim());
+ SmallVector<MDNode *> ItemChildren;
+ ItemChildren.push_back(new (Arena) TextNode(ItemText));
+ auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
+ Items.push_back(Item);
+ Reader.advance();
+ }
+ auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+ LDBG() << "emitting UnorderedListNode items=" << Items.size();
+ return List;
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -268,66 +337,21 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
- // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
- // indented up to 3 spaces, the closing fence must use the same character
- // and be at least as long as the opening fence, and the closing fence may
- // only be followed by spaces. Doxygen specifics should be handled on a
- // case-by-case basis.
+ // Fenced code block.
if (Line.starts_with("```") || Line.starts_with("~~~")) {
- char Fence = Line[0];
- StringRef Lang = Saver.save(Line.drop_front(3).trim());
- Reader.advance(); // consume opening fence
- SmallVector<StringRef> CodeLines;
- while (!Reader.atEnd()) {
- StringRef CodeLine = Reader.peek().trim();
- if (CodeLine.size() >= 3 &&
- all_of(CodeLine.take_front(3),
- [Fence](char C) { return C == Fence; }))
- break;
- CodeLines.push_back(Saver.save(Reader.advance()));
- }
- if (!Reader.atEnd())
- Reader.advance(); // consume closing fence
- auto *Code =
- new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
- LDBG() << "emitting FencedCodeNode lang='" << Lang
- << "' lines=" << CodeLines.size();
- Nodes.push_back(Code);
+ Nodes.push_back(parseFencedCode(Reader, Arena, Saver));
continue;
}
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
- SmallVector<StringRef> Rows;
- // TODO: Rows are kept as raw line text for now. Table cells may contain
- // inline content (emphasis, code spans, links), so each row may need to
- // be split on '|' and parsed further into structured cells.
- while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
- Rows.push_back(Saver.save(Reader.advance().trim()));
- auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
- LDBG() << "emitting TableNode rows=" << Rows.size();
- Nodes.push_back(Table);
+ Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
continue;
}
// Unordered list item.
if (isListItem(Line)) {
- SmallVector<ListItemNode *> Items;
- while (!Reader.atEnd()) {
- StringRef L = Reader.peek().trim();
- if (!isListItem(L))
- break;
- StringRef ItemText = Saver.save(L.drop_front(2).trim());
- SmallVector<MDNode *> ItemChildren;
- ItemChildren.push_back(new (Arena) TextNode(ItemText));
- auto *Item =
- new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
- Items.push_back(Item);
- Reader.advance();
- }
- auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
- LDBG() << "emitting UnorderedListNode items=" << Items.size();
- Nodes.push_back(List);
+ Nodes.push_back(parseUnorderedList(Reader, Arena, Saver));
continue;
}
>From 86e45d603fefc440f3334516b5f4fcfd69354d7a Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:27:51 -0400
Subject: [PATCH 16/23] [clang-doc] Add ATX heading parsing with inline content
support
---
.../clang-doc/support/Markdown.cpp | 34 +++++++++++
.../clang-doc/MarkdownParserTest.cpp | 59 +++++++++++++++++++
2 files changed, 93 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 625b3e6305ab9..d59d95586e836 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -46,6 +46,16 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
+// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
+// six leading # characters followed by a space. Returns 0 otherwise, so seven
+// or more # characters fall back to plain text.
+static unsigned atxHeadingLevel(StringRef Line) {
+ size_t Level = Line.find_first_not_of('#');
+ if (Level == StringRef::npos || Level < 1 || Level > 6 || Line[Level] != ' ')
+ return 0;
+ return Level;
+}
+
// A forward cursor over the lines of a paragraph. Encapsulates the parse
// position so the loop can inspect the current or an upcoming line and consume
// lines without manual index arithmetic. Lines are stored untrimmed; callers
@@ -317,6 +327,24 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
return List;
}
+// Parses an ATX heading: one to six leading # characters and a space, followed
+// by inline content. The cursor must be on the heading line, which is consumed.
+//
+// TODO: CommonMark §4.2 also allows up to 3 leading spaces and an optional
+// closing run of # characters; neither is handled yet.
+static HeadingNode *parseHeading(LineReader &Reader, BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ StringRef Line = Reader.peek().trim();
+ unsigned Level = atxHeadingLevel(Line);
+ assert(Level >= 1 && Level <= 6 && "parseHeading called on a non-heading");
+ StringRef Content = Line.drop_front(Level).trim();
+ Reader.advance();
+ auto *Heading =
+ new (Arena) HeadingNode(Level, parseInline(Content, Arena, Saver));
+ LDBG() << "emitting HeadingNode level=" << Level;
+ return Heading;
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -343,6 +371,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // ATX heading: 1 to 6 leading # characters and a space.
+ if (atxHeadingLevel(Line)) {
+ Nodes.push_back(parseHeading(Reader, Arena, Saver));
+ continue;
+ }
+
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 63d978061b99b..c48b7a463c3a0 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -330,4 +330,63 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "just words");
}
+TEST_F(MarkdownParserTest, Heading1) {
+ auto Nodes = parseMarkdown("# Title", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 1u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, Heading2) {
+ auto Nodes = parseMarkdown("## Title", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 2u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, Heading3) {
+ auto Nodes = parseMarkdown("### Title", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 3u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, HeadingWithInlineCode) {
+ auto Nodes = parseMarkdown("# Use `foo()`", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 1u);
+ ASSERT_EQ(H->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Use ");
+ EXPECT_EQ(cast<InlineCodeNode>(H->Children[1])->Code, "foo()");
+}
+
+TEST_F(MarkdownParserTest, HeadingWithEmphasis) {
+ auto Nodes = parseMarkdown("## see *this*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 2u);
+ ASSERT_EQ(H->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "see ");
+ auto *Em = cast<EmphasisNode>(H->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "this");
+}
+
+// Seven or more # characters are not a valid ATX heading, so the line falls
+// back to a plain-text paragraph.
+TEST_F(MarkdownParserTest, SevenHashesIsPlainText) {
+ auto Nodes = parseMarkdown("####### too many", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "####### too many");
+}
+
} // namespace
>From 2b14505cadb016131f53cbc3200973c3cee6ae04 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:32:14 -0400
Subject: [PATCH 17/23] [clang-doc] Run list item text through parseInline for
inline markup support
---
.../clang-doc/support/Markdown.cpp | 6 ++---
.../clang-doc/MarkdownParserTest.cpp | 24 ++++++++++++++++---
2 files changed, 23 insertions(+), 7 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index d59d95586e836..6901f6c2f40a5 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -315,10 +315,8 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
StringRef L = Reader.peek().trim();
if (!isListItem(L))
break;
- StringRef ItemText = Saver.save(L.drop_front(2).trim());
- SmallVector<MDNode *> ItemChildren;
- ItemChildren.push_back(new (Arena) TextNode(ItemText));
- auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
+ StringRef ItemText = L.drop_front(2).trim();
+ auto *Item = new (Arena) ListItemNode(parseInline(ItemText, Arena, Saver));
Items.push_back(Item);
Reader.advance();
}
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index c48b7a463c3a0..9a7d6d1fd0942 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -98,9 +98,27 @@ TEST_F(MarkdownParserTest, UnorderedList) {
ASSERT_EQ(Nodes.size(), 1u);
auto *N = cast<UnorderedListNode>(Nodes[0]);
ASSERT_EQ(N->Items.size(), 3u);
- EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
- EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
- EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
+ // Each item's children are the inline nodes from parseInline.
+ StringRef ExpectedText[] = {"foo", "bar", "baz"};
+ for (size_t I = 0; I < N->Items.size(); ++I) {
+ auto *Item = N->Items[I];
+ ASSERT_EQ(Item->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, ExpectedText[I]);
+ }
+}
+
+TEST_F(MarkdownParserTest, ListItemWithEmphasis) {
+ auto Nodes = parseMarkdown("- an *important* note", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 1u);
+ auto *Item = N->Items[0];
+ ASSERT_EQ(Item->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(Item->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
}
TEST_F(MarkdownParserTest, MixedContent) {
>From aaf4b6e2b1600bce25f625abb2caf9ad25b52f90 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:42:27 -0400
Subject: [PATCH 18/23] [clang-doc] Add ordered list parsing with inline
content support
---
.../clang-doc/support/Markdown.cpp | 40 +++++++++++++++++
.../clang-doc/MarkdownParserTest.cpp | 44 +++++++++++++++++++
2 files changed, 84 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 6901f6c2f40a5..211fb0407578f 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -46,6 +46,14 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
+// Returns true if Line begins with an ordered list marker: one or more digits
+// followed by a period and a space (e.g. "1. ", "42. ").
+static bool isOrderedListItem(StringRef Line) {
+ size_t Dot = Line.find_first_not_of("0123456789");
+ return Dot != StringRef::npos && Dot > 0 && Line[Dot] == '.' &&
+ Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
+}
+
// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
// six leading # characters followed by a space. Returns 0 otherwise, so seven
// or more # characters fall back to plain text.
@@ -325,6 +333,32 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
return List;
}
+// Parses an ordered (numbered) list. The cursor must be on the first item; the
+// start number is taken from that item's marker and consecutive numbered lines
+// are consumed. Item numbers after the first are not validated.
+static OrderedListNode *parseOrderedList(LineReader &Reader,
+ BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ unsigned Start = 0;
+ Reader.peek().trim().take_while(isDigit).getAsInteger(10, Start);
+ SmallVector<ListItemNode *> Items;
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
+ if (!isOrderedListItem(L))
+ break;
+ // Drop the "<digits>. " marker: the digits, the period, and the space.
+ StringRef ItemText =
+ L.drop_front(L.find_first_not_of("0123456789") + 2).trim();
+ auto *Item = new (Arena) ListItemNode(parseInline(ItemText, Arena, Saver));
+ Items.push_back(Item);
+ Reader.advance();
+ }
+ auto *List = new (Arena) OrderedListNode(Start, allocateArray(Items, Arena));
+ LDBG() << "emitting OrderedListNode start=" << Start
+ << " items=" << Items.size();
+ return List;
+}
+
// Parses an ATX heading: one to six leading # characters and a space, followed
// by inline content. The cursor must be on the heading line, which is consumed.
//
@@ -387,6 +421,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // Ordered list item: digits followed by a period and a space.
+ if (isOrderedListItem(Line)) {
+ Nodes.push_back(parseOrderedList(Reader, Arena, Saver));
+ continue;
+ }
+
// Plain text line: scan for inline constructs (emphasis, strong, code) and
// wrap the result in a paragraph.
auto Inlines = parseInline(Line, Arena, Saver);
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 9a7d6d1fd0942..a0ba39c163a34 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -121,6 +121,50 @@ TEST_F(MarkdownParserTest, ListItemWithEmphasis) {
EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
}
+TEST_F(MarkdownParserTest, OrderedList) {
+ auto Nodes = parseMarkdown(R"(1. foo
+2. bar
+3. baz)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 1u);
+ ASSERT_EQ(N->Items.size(), 3u);
+ StringRef ExpectedText[] = {"foo", "bar", "baz"};
+ for (size_t I = 0; I < N->Items.size(); ++I) {
+ auto *Item = N->Items[I];
+ ASSERT_EQ(Item->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, ExpectedText[I]);
+ }
+}
+
+TEST_F(MarkdownParserTest, OrderedListCustomStart) {
+ auto Nodes = parseMarkdown(R"(5. five
+6. six)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 5u);
+ ASSERT_EQ(N->Items.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "five");
+ EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "six");
+}
+
+TEST_F(MarkdownParserTest, OrderedListItemWithEmphasis) {
+ auto Nodes = parseMarkdown("1. an *important* note", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 1u);
+ ASSERT_EQ(N->Items.size(), 1u);
+ auto *Item = N->Items[0];
+ ASSERT_EQ(Item->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(Item->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
+}
+
TEST_F(MarkdownParserTest, MixedContent) {
auto Nodes = parseMarkdown(R"(some text
```````
>From 2ce9a89495e81eb5f0c67551f114e08eadefdabd Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:47:42 -0400
Subject: [PATCH 19/23] [clang-doc] Add thematic break parsing
---
.../clang-doc/support/Markdown.cpp | 26 +++++++++++++++++++
.../clang-doc/MarkdownParserTest.cpp | 18 +++++++++++++
2 files changed, 44 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 211fb0407578f..2f0cc5bffe566 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -54,6 +54,23 @@ static bool isOrderedListItem(StringRef Line) {
Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
}
+// Returns true if Line is a thematic break: three or more matching -, *, or _
+// characters, optionally separated by spaces, with nothing else. Line is
+// expected to be trimmed.
+static bool isThematicBreak(StringRef Line) {
+ char Marker = Line.empty() ? '\0' : Line[0];
+ if (Marker != '-' && Marker != '*' && Marker != '_')
+ return false;
+ unsigned Count = 0;
+ for (char C : Line) {
+ if (C == Marker)
+ ++Count;
+ else if (C != ' ')
+ return false;
+ }
+ return Count >= 3;
+}
+
// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
// six leading # characters followed by a space. Returns 0 otherwise, so seven
// or more # characters fall back to plain text.
@@ -409,6 +426,15 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // Thematic break: 3 or more matching -, *, or _ characters. Checked before
+ // the list cases so that "* * *" and "- - -" are breaks, not list items.
+ if (isThematicBreak(Line)) {
+ Reader.advance();
+ Nodes.push_back(new (Arena) ThematicBreakNode());
+ LDBG() << "emitting ThematicBreakNode";
+ continue;
+ }
+
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index a0ba39c163a34..188d1987ac06d 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -451,4 +451,22 @@ TEST_F(MarkdownParserTest, SevenHashesIsPlainText) {
EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "####### too many");
}
+TEST_F(MarkdownParserTest, ThematicBreakDashes) {
+ auto Nodes = parseMarkdown("---", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, ThematicBreakAsterisks) {
+ auto Nodes = parseMarkdown("***", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, ThematicBreakUnderscores) {
+ auto Nodes = parseMarkdown("___", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
} // namespace
>From 843d0554dd1ad93c139d69f09fdc06800df7b078 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:58:15 -0400
Subject: [PATCH 20/23] [clang-doc] Add CommonMark spec edge case tests with
section citations
---
.../clang-doc/MarkdownParserTest.cpp | 145 ++++++++++++++++++
1 file changed, 145 insertions(+)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 188d1987ac06d..350b15c2541ed 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -469,4 +469,149 @@ TEST_F(MarkdownParserTest, ThematicBreakUnderscores) {
EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
}
+//===----------------------------------------------------------------------===//
+// CommonMark spec edge cases (spec.commonmark.org/0.31.2). Each test cites the
+// section and example it exercises. Cases marked DIVERGENCE document where this
+// simplified parser intentionally differs from full CommonMark.
+//===----------------------------------------------------------------------===//
+
+// CommonMark §4.1 Example 51: spaces are allowed between the characters.
+TEST_F(MarkdownParserTest, ThematicBreakSpacedDashes) {
+ auto Nodes = parseMarkdown("- - -", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+// CommonMark §4.1 Example 44: +++ is not a thematic break.
+TEST_F(MarkdownParserTest, PlusesAreNotThematicBreak) {
+ auto Nodes = parseMarkdown("+++", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "+++");
+}
+
+// CommonMark §4.1 Example 46: fewer than three characters is not a break.
+TEST_F(MarkdownParserTest, TwoDashesAreNotThematicBreak) {
+ auto Nodes = parseMarkdown("--", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "--");
+}
+
+// CommonMark §4.2 Example 64: a # not followed by a space is not a heading.
+TEST_F(MarkdownParserTest, HashWithoutSpaceIsNotHeading) {
+ auto Nodes = parseMarkdown("#5 bolt", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "#5 bolt");
+}
+
+// CommonMark §4.2 Example 64: "#hashtag" is a paragraph, not a heading.
+TEST_F(MarkdownParserTest, HashtagIsNotHeading) {
+ auto Nodes = parseMarkdown("#hashtag", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "#hashtag");
+}
+
+// CommonMark §4.2 Example 67: spaces around the heading content are stripped.
+TEST_F(MarkdownParserTest, HeadingStripsContentSpaces) {
+ auto Nodes = parseMarkdown("# foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *H = cast<HeadingNode>(Nodes[0]);
+ EXPECT_EQ(H->Level, 1u);
+ ASSERT_EQ(H->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2: * is a valid bullet list marker.
+TEST_F(MarkdownParserTest, UnorderedListAsteriskMarker) {
+ auto Nodes = parseMarkdown("* foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2 Example 301: + is a valid bullet list marker.
+TEST_F(MarkdownParserTest, UnorderedListPlusMarker) {
+ auto Nodes = parseMarkdown("+ foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2 Example 267: an ordered list may start at 0.
+TEST_F(MarkdownParserTest, OrderedListStartZero) {
+ auto Nodes = parseMarkdown("0. ok", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *N = cast<OrderedListNode>(Nodes[0]);
+ EXPECT_EQ(N->Start, 0u);
+ ASSERT_EQ(N->Items.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "ok");
+}
+
+// CommonMark §5.2 Example 296: ordered lists may use a ) delimiter. DIVERGENCE:
+// this parser only recognizes the . delimiter, so "1) foo" is plain text.
+TEST_F(MarkdownParserTest, OrderedListParenDelimiterNotSupported) {
+ auto Nodes = parseMarkdown("1) foo", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "1) foo");
+}
+
+// CommonMark §6.2 Example 355: intraword emphasis with asterisks.
+TEST_F(MarkdownParserTest, IntrawordEmphasisAsterisk) {
+ auto Nodes = parseMarkdown("foo*bar*", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.2 Example 381: intraword strong with asterisks.
+TEST_F(MarkdownParserTest, IntrawordStrongAsterisk) {
+ auto Nodes = parseMarkdown("foo**bar**", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+ auto *St = cast<StrongNode>(P->Children[1]);
+ ASSERT_EQ(St->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.2 Example 360: intraword underscores do NOT open emphasis, so
+// "foo_bar_" is literal text. DIVERGENCE: this parser lacks the intraword
+// underscore rule (see the findClosingDelim TODO) and treats it as emphasis.
+TEST_F(MarkdownParserTest, IntrawordUnderscoreEmphasisDivergence) {
+ auto Nodes = parseMarkdown("foo_bar_", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 2u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.1 Example 331: a code span strips one leading and trailing
+// space when both are present.
+TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
+ auto Nodes = parseMarkdown("`` x ``", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *P = cast<ParagraphNode>(Nodes[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
+}
+
} // namespace
>From e9e6b8d7b1509d36d5c93f604f31b3e4ad9a63ea Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 00:03:54 -0400
Subject: [PATCH 21/23] [clang-doc] Add block quote parsing with recursive
inner parsing
---
.../clang-doc/support/Markdown.cpp | 38 ++++++++++++++
.../clang-doc/MarkdownParserTest.cpp | 50 +++++++++++++++++++
2 files changed, 88 insertions(+)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 2f0cc5bffe566..fdfc619e0ea05 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -14,6 +14,7 @@
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>
+#include <string>
#define DEBUG_TYPE "clang-doc"
@@ -71,6 +72,12 @@ static bool isThematicBreak(StringRef Line) {
return Count >= 3;
}
+// Returns true if Line is a block quote line: it starts with "> ", or is a bare
+// ">" marking an empty quote line.
+static bool isBlockQuote(StringRef Line) {
+ return Line.starts_with("> ") || Line == ">";
+}
+
// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
// six leading # characters followed by a space. Returns 0 otherwise, so seven
// or more # characters fall back to plain text.
@@ -394,6 +401,31 @@ static HeadingNode *parseHeading(LineReader &Reader, BumpPtrAllocator &Arena,
return Heading;
}
+// Parses a block quote: one or more consecutive lines beginning with "> ". The
+// > marker and one following space are stripped from each line, and the
+// collected text is parsed recursively, so a quote's children are block-level
+// nodes and nested quotes fall out naturally.
+static BlockQuoteNode *parseBlockQuote(LineReader &Reader,
+ BumpPtrAllocator &Arena) {
+ std::string Inner;
+ bool First = true;
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
+ if (!isBlockQuote(L))
+ break;
+ if (!First)
+ Inner += '\n';
+ First = false;
+ StringRef Content = L.starts_with("> ") ? L.drop_front(2) : L.drop_front(1);
+ Inner.append(Content.data(), Content.size());
+ Reader.advance();
+ }
+ ArrayRef<MDNode *> Children = parseMarkdown(Inner, Arena);
+ auto *Quote = new (Arena) BlockQuoteNode(Children);
+ LDBG() << "emitting BlockQuoteNode children=" << Children.size();
+ return Quote;
+}
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -435,6 +467,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
continue;
}
+ // Block quote: consecutive lines beginning with "> ".
+ if (isBlockQuote(Line)) {
+ Nodes.push_back(parseBlockQuote(Reader, Arena));
+ continue;
+ }
+
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 350b15c2541ed..aedcd9407b197 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -614,4 +614,54 @@ TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
}
+TEST_F(MarkdownParserTest, BlockQuote) {
+ auto Nodes = parseMarkdown("> hello", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Q->Children.size(), 1u);
+ auto *P = cast<ParagraphNode>(Q->Children[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "hello");
+}
+
+TEST_F(MarkdownParserTest, BlockQuoteWithFencedCode) {
+ auto Nodes = parseMarkdown(R"(> ```cpp
+> int x = 0;
+> ```)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Q->Children.size(), 1u);
+ auto *Code = cast<FencedCodeNode>(Q->Children[0]);
+ EXPECT_EQ(Code->Lang, "cpp");
+ ASSERT_EQ(Code->Lines.size(), 1u);
+ EXPECT_EQ(Code->Lines[0], "int x = 0;");
+}
+
+TEST_F(MarkdownParserTest, BlockQuoteWithEmphasis) {
+ auto Nodes = parseMarkdown("> an *important* note", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Q->Children.size(), 1u);
+ auto *P = cast<ParagraphNode>(Q->Children[0]);
+ ASSERT_EQ(P->Children.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "an ");
+ auto *Em = cast<EmphasisNode>(P->Children[1]);
+ ASSERT_EQ(Em->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+ EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " note");
+}
+
+TEST_F(MarkdownParserTest, NestedBlockQuote) {
+ auto Nodes = parseMarkdown("> > deep", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ auto *Outer = cast<BlockQuoteNode>(Nodes[0]);
+ ASSERT_EQ(Outer->Children.size(), 1u);
+ auto *Inner = cast<BlockQuoteNode>(Outer->Children[0]);
+ ASSERT_EQ(Inner->Children.size(), 1u);
+ auto *P = cast<ParagraphNode>(Inner->Children[0]);
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "deep");
+}
+
} // namespace
>From 9325916d90af28470b9c7dd634b3c12471cc6dc5 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 12:22:13 -0400
Subject: [PATCH 22/23] [clang-doc] Address review feedback: review comments
---
.../clang-doc/support/Markdown.cpp | 2 +-
.../clang-doc/support/Markdown.h | 52 +++++++++----------
2 files changed, 27 insertions(+), 27 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index fdfc619e0ea05..08277b1405e0b 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -293,7 +293,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
// opening fence; the fence, body lines, and closing fence are consumed.
//
-// TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+// TODO: Follow CommonMark spec §4.5 more closely. Opening fences may be
// indented up to 3 spaces, the closing fence must use the same character and be
// at least as long as the opening fence, and the closing fence may only be
// followed by spaces. Doxygen specifics should be handled on a case-by-case
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 8c2055868671a..a9b00a5c10225 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -15,27 +15,27 @@
/// llvm::isa<>/cast<>/dyn_cast<> to check what a node is.
///
/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
-/// TextNode -- plain text run
-/// SoftBreakNode -- soft line break
-/// HardBreakNode -- hard line break (trailing spaces or backslash)
-/// InlineCodeNode -- inline code span (`code`)
-/// EmphasisNode -- emphasis (*text* or _text_)
-/// StrongNode -- strong emphasis (**text** or __text__)
+/// TextNode: plain text run
+/// SoftBreakNode: soft line break
+/// HardBreakNode: hard line break (trailing spaces or backslash)
+/// InlineCodeNode: inline code span (`code`)
+/// EmphasisNode: emphasis (*text* or _text_)
+/// StrongNode: strong emphasis (**text** or __text__)
///
/// Block nodes:
-/// ParagraphNode -- sequence of inline nodes
-/// HeadingNode -- ATX heading (# through ######), level 1-6
-/// FencedCodeNode -- fenced code block (``` or ~~~)
-/// TableNode -- pipe table (raw row text; TODO: structured cells)
-/// UnorderedListNode -- bullet list (-, *, +)
-/// OrderedListNode -- numbered list with explicit start number
-/// ListItemNode -- single item inside a list
-/// BlockQuoteNode -- block quote (>)
-/// ThematicBreakNode -- horizontal rule (---, ***, ___)
+/// ParagraphNode: sequence of inline nodes
+/// HeadingNode: ATX heading (# through ######), level 1-6
+/// FencedCodeNode: fenced code block (``` or ~~~)
+/// TableNode: pipe table (raw row text; TODO: structured cells)
+/// UnorderedListNode: bullet list (-, *, +)
+/// OrderedListNode: numbered list with explicit start number
+/// ListItemNode: single item inside a list
+/// BlockQuoteNode: block quote (>)
+/// ThematicBreakNode: horizontal rule (---, ***, ___)
///
/// All nodes are arena-allocated. The caller owns the arena and must keep it
-/// alive for the lifetime of any returned nodes. The parser never crashes on
-/// malformed input; unrecognized text falls back to TextNode.
+/// alive for the lifetime of any returned nodes. Malformed input is parsed as
+/// plain text rather than rejected; unrecognized text falls back to TextNode.
///
//===----------------------------------------------------------------------===//
@@ -58,7 +58,7 @@ enum class NodeKind {
NK_InlineCode,
NK_Emphasis,
NK_Strong,
- NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+ NK_LastInline = NK_Strong, // sentinel: all inline kinds are <= this
// Block nodes
NK_Paragraph,
@@ -70,12 +70,12 @@ enum class NodeKind {
NK_ListItem,
NK_BlockQuote,
NK_ThematicBreak,
- NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
+ NK_FirstBlock = NK_Paragraph, // sentinel: all block kinds are >= this
};
-/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
-/// Nodes are arena-allocated and have no virtual destructor; use
-/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
+/// Base type for all Markdown AST nodes. Nodes are arena-allocated and have no
+/// virtual destructor; use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting.
struct MDNode {
NodeKind Kind;
explicit MDNode(NodeKind K) : Kind(K) {}
@@ -93,7 +93,7 @@ struct TextNode : MDNode {
static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
};
-/// Soft line break -- a newline that does not end the paragraph.
+/// Soft line break: a newline that does not end the paragraph.
struct SoftBreakNode : MDNode {
SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
static bool classof(const MDNode *N) {
@@ -101,7 +101,7 @@ struct SoftBreakNode : MDNode {
}
};
-/// Hard line break -- two trailing spaces or a backslash before a newline.
+/// Hard line break: two trailing spaces or a backslash before a newline.
struct HardBreakNode : MDNode {
HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
static bool classof(const MDNode *N) {
@@ -143,7 +143,7 @@ struct StrongNode : MDNode {
// Block nodes
//===----------------------------------------------------------------------===//
-/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// A paragraph: sequence of inline nodes separated from other blocks by
/// blank lines.
struct ParagraphNode : MDNode {
llvm::ArrayRef<MDNode *> Children;
@@ -169,7 +169,7 @@ struct HeadingNode : MDNode {
/// "cpp"); empty when no language was specified. Lines contains the raw text
/// of each interior line, without the opening or closing fence.
///
-/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// TODO: Follow CommonMark spec §4.5. The opening fence may be indented up
/// to 3 spaces; the closing fence must use the same character and be at least
/// as long as the opening fence; only spaces may follow the closing fence.
struct FencedCodeNode : MDNode {
>From 9061cd48f9ec27d72f252414c047626bc1add513 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 12:42:12 -0400
Subject: [PATCH 23/23] [clang-doc] Implement CommonMark delimiter stack for
emphasis and strong parsing
---
.../clang-doc/support/Markdown.cpp | 256 ++++++++++++++----
.../clang-doc/MarkdownParserTest.cpp | 14 +-
2 files changed, 202 insertions(+), 68 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 08277b1405e0b..9ce5339fc8cb6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>
@@ -184,50 +185,108 @@ static StringRef trimCodeSpan(StringRef Code) {
return Code;
}
-// Finds the start index of a closing emphasis run of exactly DelimLen copies of
-// DelimChar, searching forward from StartPos. Requires non-whitespace
-// immediately inside both the opening and closing delimiters and non-empty
-// content, a simplified take on the CommonMark §6.2 flanking rules. Returns
-// StringRef::npos if no valid closing run exists.
-static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
- size_t DelimLen) {
- size_t E = S.size();
- // Opening delimiter is not left-flanking if whitespace follows it.
- if (StartPos >= E || isSpace(S[StartPos]))
- return StringRef::npos;
- for (size_t J = StartPos; J + DelimLen <= E; ++J) {
- if (S[J] != DelimChar)
- continue;
- size_t Run = countRun(S, J, DelimChar);
- if (Run != DelimLen) {
- J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
- continue;
- }
- // Reject empty content and closing runs that are not right-flanking.
- if (J == StartPos || isSpace(S[J - 1]))
- continue;
- return J;
+// Treats the start and end of the string (passed as '\0') as whitespace for the
+// CommonMark flanking rules.
+static bool isFlankWhitespace(char C) { return C == '\0' || isSpace(C); }
+
+// Computes whether a delimiter run can open or close emphasis, from the
+// characters immediately before and after the run, per the CommonMark §6.2
+// flanking rules. Before and After are '\0' at the string boundaries.
+static void computeFlanking(char Before, char Marker, char After, bool &CanOpen,
+ bool &CanClose) {
+ bool AfterWS = isFlankWhitespace(After);
+ bool BeforeWS = isFlankWhitespace(Before);
+ bool AfterPunct = isPunct(After);
+ bool BeforePunct = isPunct(Before);
+ bool LeftFlanking = !AfterWS && (!AfterPunct || BeforeWS || BeforePunct);
+ bool RightFlanking = !BeforeWS && (!BeforePunct || AfterWS || AfterPunct);
+ if (Marker == '_') {
+ // Underscore does not open or close emphasis intraword.
+ CanOpen = LeftFlanking && (!RightFlanking || BeforePunct);
+ CanClose = RightFlanking && (!LeftFlanking || AfterPunct);
+ } else {
+ CanOpen = LeftFlanking;
+ CanClose = RightFlanking;
}
- return StringRef::npos;
}
+namespace {
+// One piece of inline content while emphasis is being resolved. A piece is
+// either a finished content node (text, code span, or a built emphasis or
+// strong node) or a run of delimiter characters that may still open or close
+// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs
+// can be spliced out without shifting the others.
+struct InlinePiece {
+ MDNode *Node = nullptr; // content node, or null while this is a delimiter run
+ char Ch = 0; // '*' or '_' for a delimiter run
+ size_t Len = 0; // delimiters still available in the run
+ unsigned OrigLen = 0; // original run length, for the multiple-of-three rule
+ bool CanOpen = false;
+ bool CanClose = false;
+ int Prev = -1;
+ int Next = -1;
+};
+} // namespace
+
// Parses the inline content of a single line into a sequence of inline nodes:
-// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
-// _text_). Runs that match no construct become TextNodes. Emphasis and strong
-// recurse so their content may itself contain inline constructs. Text with no
-// markers yields a single TextNode.
+// inline code (`code`), emphasis (*text* or _text_), and strong (**text** or
+// __text__). Emphasis is resolved with a CommonMark-style delimiter stack: a
+// first pass tokenizes the line into text, code spans, and delimiter runs (each
+// tagged with its flanking flags), then a second pass walks closers back to
+// openers, honoring the multiple-of-three rule. Unmatched runs stay as text.
//
-// TODO: This covers the common cases but not the full CommonMark §6 inline
-// model (delimiter stacks, intraword underscore rules, links, autolinks).
+// TODO: This does not yet handle links, autolinks, or backslash escapes.
static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
StringSaver &Saver) {
- SmallVector<MDNode *> Nodes;
+ SmallVector<InlinePiece> Pool;
+ int Head = -1, Tail = -1;
+
+ auto makePiece = [&]() -> int {
+ Pool.emplace_back();
+ return Pool.size() - 1;
+ };
+ auto linkAtTail = [&](int Idx) {
+ Pool[Idx].Prev = Tail;
+ (Tail != -1 ? Pool[Tail].Next : Head) = Idx;
+ Tail = Idx;
+ };
+ auto appendNode = [&](MDNode *N) {
+ int Idx = makePiece();
+ Pool[Idx].Node = N;
+ linkAtTail(Idx);
+ };
+ // Content nodes pass through; a leftover delimiter run becomes a TextNode of
+ // its remaining characters.
+ auto pieceNode = [&](int P) -> MDNode * {
+ if (Pool[P].Node)
+ return Pool[P].Node;
+ return new (Arena)
+ TextNode(Saver.save(std::string(Pool[P].Len, Pool[P].Ch)));
+ };
+ // Merges adjacent TextNodes so unmatched delimiters coalesce with neighboring
+ // text, then copies the result into the arena.
+ auto finalize = [&](SmallVectorImpl<MDNode *> &Nodes) -> ArrayRef<MDNode *> {
+ SmallVector<MDNode *> Merged;
+ for (MDNode *Nd : Nodes) {
+ if (isa<TextNode>(Nd) && !Merged.empty() &&
+ isa<TextNode>(Merged.back())) {
+ StringRef Prev = cast<TextNode>(Merged.back())->Text;
+ StringRef Cur = cast<TextNode>(Nd)->Text;
+ Merged.back() =
+ new (Arena) TextNode(Saver.save(Prev.str() + Cur.str()));
+ } else {
+ Merged.push_back(Nd);
+ }
+ }
+ return allocateArray(Merged, Arena);
+ };
+
+ // Phase 1: tokenize the line into text, code spans, and delimiter runs.
CharReader Reader(S);
size_t TextStart = 0;
-
auto flushText = [&](size_t End) {
if (End > TextStart)
- Nodes.push_back(new (Arena) TextNode(
+ appendNode(new (Arena) TextNode(
Saver.save(S.substr(TextStart, End - TextStart))));
};
@@ -246,7 +305,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
flushText(Pos);
StringRef Code =
trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
- Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
+ appendNode(new (Arena) InlineCodeNode(Saver.save(Code)));
Reader.seek(ClosePos + OpenLen);
TextStart = Reader.position();
continue;
@@ -256,38 +315,117 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
continue;
}
- // Emphasis (*text*, _text_) and strong (**text**, __text__).
+ // Delimiter run for emphasis or strong.
if (C == '*' || C == '_') {
- // Strong binds the two-delimiter form before single-delimiter emphasis.
- if (Reader.peek(1) == C) {
- size_t Close = findClosingDelim(S, Pos + 2, C, 2);
- if (Close != StringRef::npos) {
- flushText(Pos);
- StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
- Nodes.push_back(new (Arena)
- StrongNode(parseInline(Inner, Arena, Saver)));
- Reader.seek(Close + 2);
- TextStart = Reader.position();
- continue;
+ size_t RunLen = countRun(S, Pos, C);
+ flushText(Pos);
+ char Before = Pos == 0 ? '\0' : S[Pos - 1];
+ char After = Pos + RunLen < S.size() ? S[Pos + RunLen] : '\0';
+ int Idx = makePiece();
+ InlinePiece &D = Pool[Idx];
+ D.Ch = C;
+ D.Len = RunLen;
+ D.OrigLen = RunLen;
+ computeFlanking(Before, C, After, D.CanOpen, D.CanClose);
+ linkAtTail(Idx);
+ Reader.seek(Pos + RunLen);
+ TextStart = Reader.position();
+ continue;
+ }
+
+ Reader.advance();
+ }
+ flushText(S.size());
+
+ // Phase 2: match closers back to openers. OpenersBottom records, per closer
+ // kind, how far back a failed search needs to look, keyed by delimiter char,
+ // run length mod 3, and whether the closer can also open.
+ int OpenersBottom[12];
+ for (int &B : OpenersBottom)
+ B = -1;
+ auto bucket = [](const InlinePiece &P) {
+ return (P.Ch == '_' ? 6 : 0) + (P.OrigLen % 3) * 2 + (P.CanOpen ? 1 : 0);
+ };
+
+ int Current = Head;
+ while (Current != -1) {
+ // Advance to the next run that can close.
+ while (Current != -1 &&
+ !(Pool[Current].Ch && Pool[Current].CanClose && Pool[Current].Len))
+ Current = Pool[Current].Next;
+ if (Current == -1)
+ break;
+ int Closer = Current;
+ int Key = bucket(Pool[Closer]);
+
+ // Search back for the nearest matching opener.
+ int Opener = Pool[Closer].Prev;
+ bool Found = false;
+ while (Opener != -1 && Opener != OpenersBottom[Key]) {
+ InlinePiece &O = Pool[Opener];
+ if (O.Ch == Pool[Closer].Ch && O.Len && O.CanOpen) {
+ unsigned Sum = O.OrigLen + Pool[Closer].OrigLen;
+ bool OddMatch = (O.CanClose || Pool[Closer].CanOpen) && Sum % 3 == 0 &&
+ !(O.OrigLen % 3 == 0 && Pool[Closer].OrigLen % 3 == 0);
+ if (!OddMatch) {
+ Found = true;
+ break;
}
}
- size_t Close = findClosingDelim(S, Pos + 1, C, 1);
- if (Close != StringRef::npos) {
- flushText(Pos);
- StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
- Nodes.push_back(new (Arena)
- EmphasisNode(parseInline(Inner, Arena, Saver)));
- Reader.seek(Close + 1);
- TextStart = Reader.position();
- continue;
- }
+ Opener = Pool[Opener].Prev;
}
- Reader.advance();
+ if (!Found) {
+ OpenersBottom[Key] = Pool[Closer].Prev;
+ // A run that cannot also open will never match anything; keep its text
+ // but stop treating it as a delimiter.
+ if (!Pool[Closer].CanOpen)
+ Pool[Closer].CanClose = false;
+ Current = Pool[Closer].Next;
+ continue;
+ }
+
+ // Wrap the pieces between opener and closer, consuming one delimiter from
+ // each side for emphasis or two for strong.
+ unsigned Use = Pool[Opener].Len >= 2 && Pool[Closer].Len >= 2 ? 2 : 1;
+ SmallVector<MDNode *> Inner;
+ for (int P = Pool[Opener].Next; P != Closer; P = Pool[P].Next)
+ Inner.push_back(pieceNode(P));
+ Pool[Opener].Len -= Use;
+ Pool[Closer].Len -= Use;
+ MDNode *Emph =
+ Use == 2
+ ? static_cast<MDNode *>(new (Arena) StrongNode(finalize(Inner)))
+ : static_cast<MDNode *>(new (Arena) EmphasisNode(finalize(Inner)));
+ int EP = makePiece();
+ Pool[EP].Node = Emph;
+ Pool[EP].Prev = Opener;
+ Pool[EP].Next = Closer;
+ Pool[Opener].Next = EP;
+ Pool[Closer].Prev = EP;
+
+ // Drop the opener or closer once its run is fully consumed.
+ if (Pool[Opener].Len == 0) {
+ int Pr = Pool[Opener].Prev;
+ Pool[EP].Prev = Pr;
+ (Pr != -1 ? Pool[Pr].Next : Head) = EP;
+ }
+ if (Pool[Closer].Len == 0) {
+ int Nx = Pool[Closer].Next;
+ Pool[EP].Next = Nx;
+ (Nx != -1 ? Pool[Nx].Prev : Tail) = EP;
+ Current = Nx;
+ } else {
+ Current = Closer;
+ }
}
- flushText(S.size());
- return allocateArray(Nodes, Arena);
+ // Phase 3: collect the surviving pieces, dropping fully consumed delimiters.
+ SmallVector<MDNode *> Result;
+ for (int P = Head; P != -1; P = Pool[P].Next)
+ if (Pool[P].Node || Pool[P].Len)
+ Result.push_back(pieceNode(P));
+ return finalize(Result);
}
// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index aedcd9407b197..49e61e8c129fa 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -590,18 +590,14 @@ TEST_F(MarkdownParserTest, IntrawordStrongAsterisk) {
EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
}
-// CommonMark §6.2 Example 360: intraword underscores do NOT open emphasis, so
-// "foo_bar_" is literal text. DIVERGENCE: this parser lacks the intraword
-// underscore rule (see the findClosingDelim TODO) and treats it as emphasis.
-TEST_F(MarkdownParserTest, IntrawordUnderscoreEmphasisDivergence) {
+// CommonMark §6.2 Example 360: intraword underscores do not open or close
+// emphasis, so "foo_bar_" stays as literal text.
+TEST_F(MarkdownParserTest, IntrawordUnderscoreIsText) {
auto Nodes = parseMarkdown("foo_bar_", Arena);
ASSERT_EQ(Nodes.size(), 1u);
auto *P = cast<ParagraphNode>(Nodes[0]);
- ASSERT_EQ(P->Children.size(), 2u);
- EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
- auto *Em = cast<EmphasisNode>(P->Children[1]);
- ASSERT_EQ(Em->Children.size(), 1u);
- EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+ ASSERT_EQ(P->Children.size(), 1u);
+ EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo_bar_");
}
// CommonMark §6.1 Example 331: a code span strips one leading and trailing
More information about the cfe-commits
mailing list