[clang-tools-extra] [clang-doc] Add standalone Markdown parsing library (PR #202991)
Neil Nair via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 11 23:34:47 PDT 2026
https://github.com/Neil-N4 updated https://github.com/llvm/llvm-project/pull/202991
>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 09:51:48 -0400
Subject: [PATCH 1/5] [clang-doc] Add standalone Markdown parsing library
---
.../clang-doc/support/CMakeLists.txt | 3 +-
.../clang-doc/support/Markdown.cpp | 145 ++++++++++++++++++
.../clang-doc/support/Markdown.h | 72 +++++++++
.../unittests/clang-doc/CMakeLists.txt | 4 +-
.../clang-doc/MarkdownParserTest.cpp | 94 ++++++++++++
5 files changed, 316 insertions(+), 2 deletions(-)
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h
create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt
index 8ac913ffbe998..acff865190ff9 100644
--- a/clang-tools-extra/clang-doc/support/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt
@@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS
add_clang_library(clangDocSupport STATIC
File.cpp
+ Markdown.cpp
Utils.cpp
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
new file mode 100644
index 0000000000000..776150b939d27
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLog.h"
+
+#define DEBUG_TYPE "clang-doc-markdown"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+static MDNode makeText(StringRef S) {
+ return {NodeKind::NK_Text, S, {}};
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces,
+// and has at least one -.
+static bool isSepRow(StringRef Line) {
+ return Line.contains('-') &&
+ Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Returns true if Line begins with a bullet list marker (-, *, or +)
+// followed by a space.
+static bool isListItem(StringRef Line) {
+ return Line.starts_with("- ") || Line.starts_with("* ") ||
+ Line.starts_with("+ ");
+}
+
+static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
+ BumpPtrAllocator &Arena) {
+ if (Nodes.empty())
+ return {};
+ MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
+ std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
+ return ArrayRef<MDNode>(Allocated, Nodes.size());
+}
+
+ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
+ BumpPtrAllocator &Arena) {
+ if (ParagraphText.trim().empty())
+ return {};
+
+ SmallVector<StringRef, 16> Lines;
+ ParagraphText.split(Lines, '\n');
+
+ SmallVector<MDNode> Nodes;
+ size_t I = 0, E = Lines.size();
+
+ while (I < E) {
+ StringRef Line = Lines[I].trim();
+
+ if (Line.empty()) {
+ ++I;
+ continue;
+ }
+
+ // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+ // indented up to 3 spaces, the closing fence must use the same character
+ // and be at least as long as the opening fence, and the closing fence may
+ // only be followed by spaces. Doxygen specifics should be handled on a
+ // case-by-case basis.
+ if (Line.starts_with("```") || Line.starts_with("~~~")) {
+ char Fence = Line[0];
+ StringRef Lang = Line.drop_front(3).trim();
+ SmallVector<MDNode> CodeLines;
+ ++I;
+ while (I < E) {
+ StringRef CodeLine = Lines[I].trim();
+ if (CodeLine.size() >= 3 &&
+ all_of(CodeLine.take_front(3),
+ [Fence](char C) { return C == Fence; }))
+ break;
+ CodeLines.push_back(makeText(Lines[I]));
+ ++I;
+ }
+ ++I; // skip closing fence
+ MDNode Code;
+ Code.Kind = NodeKind::NK_FencedCode;
+ Code.Content = Lang;
+ Code.Children = allocateNodes(CodeLines, Arena);
+ LDBG() << "emitting NK_FencedCode lang='" << Lang
+ << "' lines=" << CodeLines.size();
+ Nodes.push_back(Code);
+ continue;
+ }
+
+ // Pipe table: current line has | and next line is a separator row.
+ if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+ SmallVector<MDNode> Rows;
+ while (I < E && Lines[I].trim().contains('|')) {
+ Rows.push_back(makeText(Lines[I].trim()));
+ ++I;
+ }
+ MDNode Table;
+ Table.Kind = NodeKind::NK_Table;
+ Table.Content = {};
+ Table.Children = allocateNodes(Rows, Arena);
+ LDBG() << "emitting NK_Table rows=" << Rows.size();
+ Nodes.push_back(Table);
+ continue;
+ }
+
+ // Unordered list item.
+ if (isListItem(Line)) {
+ SmallVector<MDNode> Items;
+ while (I < E) {
+ StringRef L = Lines[I].trim();
+ if (!isListItem(L))
+ break;
+ MDNode Item;
+ Item.Kind = NodeKind::NK_ListItem;
+ Item.Content = L.drop_front(2).trim();
+ Item.Children = {};
+ Items.push_back(Item);
+ ++I;
+ }
+ MDNode List;
+ List.Kind = NodeKind::NK_UnorderedList;
+ List.Content = {};
+ List.Children = allocateNodes(Items, Arena);
+ LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+ Nodes.push_back(List);
+ continue;
+ }
+
+ // Plain text fallback.
+ Nodes.push_back(makeText(Line));
+ ++I;
+ }
+
+ LDBG() << "parseMarkdown done nodes=" << Nodes.size();
+ return allocateNodes(Nodes, Arena);
+}
+
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
new file mode 100644
index 0000000000000..890f764f937b1
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines a standalone Markdown parsing library for the LLVM
+/// ecosystem. The parser takes plain text and returns a tree of typed nodes
+/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+///
+/// This is a simple Markdown parser for use inside Clang-Doc's comment
+/// pipeline. You give it a paragraph of text and an arena allocator, and it
+/// gives back a list of typed nodes describing the Markdown structure it found.
+///
+/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
+/// you get back an empty list and can fall back to plain-text output. If it
+/// does, you get a tree of MDNode structs where each node has a kind, optional
+/// content (like the language tag on a code fence), and optional children.
+///
+/// All nodes are allocated in the arena you pass in. You own the arena and are
+/// responsible for keeping it alive as long as you use the nodes.
+///
+/// The parser handles fenced code blocks, pipe tables, and unordered lists.
+/// Anything it does not recognize comes back as a plain text node. It will
+/// never crash on bad input.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+
+namespace clang::doc::markdown {
+
+enum class NodeKind {
+ // Block nodes
+ NK_Paragraph,
+ NK_FencedCode,
+ NK_Table,
+ NK_UnorderedList,
+ NK_OrderedList,
+ NK_ListItem,
+ NK_ThematicBreak,
+ // Inline nodes
+ NK_Text,
+ NK_InlineCode,
+ NK_Emphasis,
+ NK_Strong,
+ NK_SoftBreak,
+};
+
+struct MDNode {
+ NodeKind Kind;
+ llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
+ llvm::ArrayRef<MDNode> Children; // arena allocated
+};
+
+/// Parses Markdown from a single comment paragraph's text.
+/// Returns an empty ArrayRef if no Markdown constructs are found,
+/// so generators can fall back to plain-text rendering at zero cost.
+llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
+ llvm::BumpPtrAllocator &Arena);
+
+} // namespace clang::doc::markdown
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 01b34ec9a791e..b74207ac88fa7 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests
ClangDocTest.cpp
GeneratorTest.cpp
HTMLGeneratorTest.cpp
+ MarkdownParserTest.cpp
MDGeneratorTest.cpp
MergeTest.cpp
SerializeTest.cpp
@@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests
target_link_libraries(ClangDocTests
PRIVATE
clangDoc
+ clangDocSupport
LLVMTestingSupport
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
new file mode 100644
index 0000000000000..8df5efc7f1d5f
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/Support/Allocator.h"
+#include "gtest/gtest.h"
+
+using namespace clang::doc::markdown;
+
+namespace {
+
+TEST(MarkdownParserTest, EmptyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, WhitespaceOnlyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown(" \n \n", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("hello world", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
+ EXPECT_EQ(Nodes[0].Content, "hello world");
+}
+
+TEST(MarkdownParserTest, FencedCodeBlock) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(Nodes[0].Content, "cpp");
+ ASSERT_EQ(Nodes[0].Children.size(), 1u);
+}
+
+TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(Nodes[0].Content.empty());
+}
+
+TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+ // Unterminated fence should not crash and should produce a code node
+ // with whatever lines were found.
+ EXPECT_FALSE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PipeTable) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("a | b\nc | d", Arena);
+ // No separator row so should not be parsed as a table
+ for (const auto &Node : Nodes)
+ EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, UnorderedList) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
+ ASSERT_EQ(Nodes[0].Children.size(), 3u);
+ EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
+ EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
+ EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+}
+
+TEST(MarkdownParserTest, MixedContent) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+ EXPECT_EQ(Nodes.size(), 3u);
+}
+
+} // namespace
\ No newline at end of file
>From f4cb4a28630e0f91289bfd4416c59114c5654ff7 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 11:35:54 -0400
Subject: [PATCH 2/5] [clang-doc] Address review feedback: test fixture, raw
strings, DEBUG_TYPE, EOF newlines
---
.../clang-doc/support/Markdown.cpp | 4 +-
.../clang-doc/support/Markdown.h | 2 +-
.../clang-doc/MarkdownParserTest.cpp | 97 +++++++++++--------
3 files changed, 61 insertions(+), 42 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 776150b939d27..9e008abf8b08d 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,7 +12,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
-#define DEBUG_TYPE "clang-doc-markdown"
+#define DEBUG_TYPE "clang-doc"
using namespace llvm;
@@ -142,4 +142,4 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
return allocateNodes(Nodes, Arena);
}
-} // namespace clang::doc::markdown
\ No newline at end of file
+} // namespace clang::doc::markdown
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 890f764f937b1..09b79cc8f2437 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -69,4 +69,4 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
} // namespace clang::doc::markdown
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 8df5efc7f1d5f..ff9bad88da136 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -14,80 +14,99 @@ using namespace clang::doc::markdown;
namespace {
-TEST(MarkdownParserTest, EmptyInput) {
+struct MarkdownParserTest : public ::testing::Test {
llvm::BumpPtrAllocator Arena;
+};
+
+TEST_F(MarkdownParserTest, EmptyInput) {
auto Nodes = parseMarkdown("", Arena);
EXPECT_TRUE(Nodes.empty());
}
-TEST(MarkdownParserTest, WhitespaceOnlyInput) {
- llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
auto Nodes = parseMarkdown(" \n \n", Arena);
EXPECT_TRUE(Nodes.empty());
}
-TEST(MarkdownParserTest, PlainText) {
- llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
- EXPECT_EQ(Nodes[0].Content, "hello world");
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_Text);
+ EXPECT_EQ(N.Content, "hello world");
}
-TEST(MarkdownParserTest, FencedCodeBlock) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlock) {
+ auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;
+````)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(Nodes[0].Content, "cpp");
- ASSERT_EQ(Nodes[0].Children.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "cpp");
+ ASSERT_EQ(N.Children.size(), 1u);
}
-TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
+ auto Nodes = parseMarkdown(R"(```
+some code
+```)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(Nodes[0].Content.empty());
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Content.empty());
}
-TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+ auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;)",
+ Arena);
// Unterminated fence should not crash and should produce a code node
// with whatever lines were found.
EXPECT_FALSE(Nodes.empty());
}
-TEST(MarkdownParserTest, PipeTable) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+TEST_F(MarkdownParserTest, PipeTable) {
+ auto Nodes = parseMarkdown(R"(| A | B |
+|---|---|
+| 1 | 2 |)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
}
-TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("a | b\nc | d", Arena);
- // No separator row so should not be parsed as a table
+TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+ auto Nodes = parseMarkdown(R"(a | b
+c | d)",
+ Arena);
+ // No separator row so should not be parsed as a table.
for (const auto &Node : Nodes)
EXPECT_NE(Node.Kind, NodeKind::NK_Table);
}
-TEST(MarkdownParserTest, UnorderedList) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+TEST_F(MarkdownParserTest, UnorderedList) {
+ auto Nodes = parseMarkdown(R"(- foo
+- bar
+- baz)",
+ Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
- ASSERT_EQ(Nodes[0].Children.size(), 3u);
- EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
- EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
- EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
+ ASSERT_EQ(N.Children.size(), 3u);
+ EXPECT_EQ(N.Children[0].Content, "foo");
+ EXPECT_EQ(N.Children[1].Content, "bar");
+ EXPECT_EQ(N.Children[2].Content, "baz");
}
-TEST(MarkdownParserTest, MixedContent) {
- llvm::BumpPtrAllocator Arena;
- auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+TEST_F(MarkdownParserTest, MixedContent) {
+ auto Nodes = parseMarkdown(R"(some text
+```
+code
+````
+- item)",
+ Arena);
EXPECT_EQ(Nodes.size(), 3u);
}
>From 3ef8f62edab311caff0907ab2b9a0c3aaeb14353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:45:44 -0400
Subject: [PATCH 3/5] [clang-doc] Add CommonMark spec tests for fenced code
blocks
---
.../clang-doc/MarkdownParserTest.cpp | 112 +++++++++++++++++-
1 file changed, 108 insertions(+), 4 deletions(-)
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ff9bad88da136..4ca979c1f1d24 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -39,7 +39,7 @@ TEST_F(MarkdownParserTest, PlainText) {
TEST_F(MarkdownParserTest, FencedCodeBlock) {
auto Nodes = parseMarkdown(R"(```cpp
int x = 0;
-````)",
+````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
const auto &N = Nodes[0];
@@ -51,7 +51,7 @@ int x = 0;
TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
auto Nodes = parseMarkdown(R"(```
some code
-```)",
+```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
const auto &N = Nodes[0];
@@ -102,12 +102,116 @@ TEST_F(MarkdownParserTest, UnorderedList) {
TEST_F(MarkdownParserTest, MixedContent) {
auto Nodes = parseMarkdown(R"(some text
-```
+```````
code
-````
+````````
- item)",
Arena);
EXPECT_EQ(Nodes.size(), 3u);
}
+// CommonMark §4.5 example 120: tilde fences work the same as backtick fences.
+TEST_F(MarkdownParserTest, TildeFence) {
+ auto Nodes = parseMarkdown(R"(~~~
+int x = 0;
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Content.empty());
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 120: tilde fence with a language tag.
+TEST_F(MarkdownParserTest, TildeFenceWithLang) {
+ auto Nodes = parseMarkdown(R"(~~~cpp
+int x = 0;
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "cpp");
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
+TEST_F(MarkdownParserTest, ClosingFenceMustMatchOpeningChar) {
+ auto Nodes = parseMarkdown(R"(```
+aaa
+~~~
+````````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ // ~~~ is content, not a closing fence.
+ ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 130: a code block can be empty.
+TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
+ auto Nodes = parseMarkdown(R"(```
+```````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(N.Children.empty());
+}
+
+// CommonMark §4.5 example 129: a code block may contain only blank lines.
+TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
+ auto Nodes = parseMarkdown("```\n\n \n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 142: lang tag is captured from the info string.
+TEST_F(MarkdownParserTest, InfoStringLangTag) {
+ auto Nodes = parseMarkdown(R"(```ruby
+def foo(x)
+ return 3
+end
+``````)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "ruby");
+ ASSERT_EQ(N.Children.size(), 3u);
+}
+
+// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
+TEST_F(MarkdownParserTest, TildeFenceInfoStringWithBackticks) {
+ auto Nodes = parseMarkdown(R"(~~~ aa ``` ~~~
+foo
+~~~)",
+ Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(N.Content, "aa ``` ~~~");
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 124: closing fence must be at least as long as the
+// opening fence.
+// TODO: our parser currently closes on the first line with 3 matching fence
+// chars regardless of opening fence length. Fix as part of the CommonMark
+// TODO in parseMarkdown().
+TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
+ auto Nodes = parseMarkdown("````\naaa\n```", Arena);
+ // The ``` line should not close the ```` fence per CommonMark, but our
+ // parser currently treats it as a closing fence. This test documents the
+ // current (non-conformant) behavior.
+ ASSERT_EQ(Nodes.size(), 1u);
+ const auto &N = Nodes[0];
+ EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ ASSERT_EQ(N.Children.size(), 1u);
+}
+
} // namespace
\ No newline at end of file
>From ffb56028d83a542a775119a7b0c2f88271b2df84 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:59:52 -0400
Subject: [PATCH 4/5] [clang-doc] Replace flat MDNode with typed node hierarchy
using LLVM RTTI
---
.../clang-doc/support/Markdown.cpp | 84 +++---
.../clang-doc/support/Markdown.h | 264 ++++++++++++++++--
.../clang-doc/MarkdownParserTest.cpp | 84 +++---
3 files changed, 312 insertions(+), 120 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 9e008abf8b08d..bee15c3e23ec3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -18,8 +18,24 @@ using namespace llvm;
namespace clang::doc::markdown {
-static MDNode makeText(StringRef S) {
- return {NodeKind::NK_Text, S, {}};
+// Allocates a contiguous array of T in the arena and returns an ArrayRef.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+ BumpPtrAllocator &Arena) {
+ if (Vec.empty())
+ return {};
+ T *Allocated = Arena.Allocate<T>(Vec.size());
+ std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated);
+ return ArrayRef<T>(Allocated, Vec.size());
+}
+
+// Interns a StringRef into the arena so it outlives the parse loop.
+static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
+ if (S.empty())
+ return {};
+ char *Buf = Arena.Allocate<char>(S.size());
+ std::copy(S.begin(), S.end(), Buf);
+ return StringRef(Buf, S.size());
}
// A line is a table separator if it only contains |, -, :, and spaces,
@@ -36,24 +52,15 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
-static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
- BumpPtrAllocator &Arena) {
- if (Nodes.empty())
- return {};
- MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
- std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
- return ArrayRef<MDNode>(Allocated, Nodes.size());
-}
-
-ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
- BumpPtrAllocator &Arena) {
+ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
+ BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
return {};
SmallVector<StringRef, 16> Lines;
ParagraphText.split(Lines, '\n');
- SmallVector<MDNode> Nodes;
+ SmallVector<MDNode *> Nodes;
size_t I = 0, E = Lines.size();
while (I < E) {
@@ -71,8 +78,8 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
// case-by-case basis.
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
- StringRef Lang = Line.drop_front(3).trim();
- SmallVector<MDNode> CodeLines;
+ StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ SmallVector<StringRef> CodeLines;
++I;
while (I < E) {
StringRef CodeLine = Lines[I].trim();
@@ -80,15 +87,13 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(makeText(Lines[I]));
+ CodeLines.push_back(internString(Lines[I], Arena));
++I;
}
++I; // skip closing fence
- MDNode Code;
- Code.Kind = NodeKind::NK_FencedCode;
- Code.Content = Lang;
- Code.Children = allocateNodes(CodeLines, Arena);
- LDBG() << "emitting NK_FencedCode lang='" << Lang
+ auto *Code =
+ new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+ LDBG() << "emitting FencedCodeNode lang='" << Lang
<< "' lines=" << CodeLines.size();
Nodes.push_back(Code);
continue;
@@ -96,50 +101,45 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
// Pipe table: current line has | and next line is a separator row.
if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
- SmallVector<MDNode> Rows;
+ SmallVector<StringRef> Rows;
while (I < E && Lines[I].trim().contains('|')) {
- Rows.push_back(makeText(Lines[I].trim()));
+ Rows.push_back(internString(Lines[I].trim(), Arena));
++I;
}
- MDNode Table;
- Table.Kind = NodeKind::NK_Table;
- Table.Content = {};
- Table.Children = allocateNodes(Rows, Arena);
- LDBG() << "emitting NK_Table rows=" << Rows.size();
+ auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+ LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
continue;
}
// Unordered list item.
if (isListItem(Line)) {
- SmallVector<MDNode> Items;
+ SmallVector<ListItemNode *> Items;
while (I < E) {
StringRef L = Lines[I].trim();
if (!isListItem(L))
break;
- MDNode Item;
- Item.Kind = NodeKind::NK_ListItem;
- Item.Content = L.drop_front(2).trim();
- Item.Children = {};
+ StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+ SmallVector<MDNode *> ItemChildren;
+ ItemChildren.push_back(new (Arena) TextNode(ItemText));
+ auto *Item =
+ new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
Items.push_back(Item);
++I;
}
- MDNode List;
- List.Kind = NodeKind::NK_UnorderedList;
- List.Content = {};
- List.Children = allocateNodes(Items, Arena);
- LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+ auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+ LDBG() << "emitting UnorderedListNode items=" << Items.size();
Nodes.push_back(List);
continue;
}
// Plain text fallback.
- Nodes.push_back(makeText(Line));
+ Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
++I;
}
LDBG() << "parseMarkdown done nodes=" << Nodes.size();
- return allocateNodes(Nodes, Arena);
+ return allocateArray(Nodes, Arena);
}
-} // namespace clang::doc::markdown
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 09b79cc8f2437..3d457bcddfac6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -7,30 +7,50 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// This file defines a standalone Markdown parsing library for the LLVM
-/// ecosystem. The parser takes plain text and returns a tree of typed nodes
-/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+/// Standalone Markdown parsing library for the LLVM ecosystem.
///
-/// This is a simple Markdown parser for use inside Clang-Doc's comment
-/// pipeline. You give it a paragraph of text and an arena allocator, and it
-/// gives back a list of typed nodes describing the Markdown structure it found.
+/// The parser takes plain paragraph text and returns a polymorphic tree of
+/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
+/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
+/// type carries exactly the fields it needs -- no overloaded Content field,
+/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting; each concrete type provides classof() for this purpose.
///
-/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
-/// you get back an empty list and can fall back to plain-text output. If it
-/// does, you get a tree of MDNode structs where each node has a kind, optional
-/// content (like the language tag on a code fence), and optional children.
+/// See
+/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
///
-/// All nodes are allocated in the arena you pass in. You own the arena and are
-/// responsible for keeping it alive as long as you use the nodes.
+/// Field ordering in each derived struct is chosen to minimize padding:
+/// 4-byte fields (like Level or Start) are declared before 16-byte fields
+/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
+/// base class's 4-byte Kind and the first derived field.
///
-/// The parser handles fenced code blocks, pipe tables, and unordered lists.
-/// Anything it does not recognize comes back as a plain text node. It will
-/// never crash on bad input.
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
+/// TextNode -- plain text run
+/// SoftBreakNode -- soft line break
+/// HardBreakNode -- hard line break (trailing spaces or backslash)
+/// InlineCodeNode -- inline code span (`code`)
+/// EmphasisNode -- emphasis (*text* or _text_)
+/// StrongNode -- strong emphasis (**text** or __text__)
+///
+/// Block nodes:
+/// ParagraphNode -- sequence of inline nodes
+/// HeadingNode -- ATX heading (# through ######), level 1-6
+/// FencedCodeNode -- fenced code block (``` or ~~~)
+/// TableNode -- pipe table (raw row text; TODO: structured cells)
+/// UnorderedListNode -- bullet list (-, *, +)
+/// OrderedListNode -- numbered list with explicit start number
+/// ListItemNode -- single item inside a list
+/// BlockQuoteNode -- block quote (>)
+/// ThematicBreakNode -- horizontal rule (---, ***, ___)
+///
+/// All nodes are arena-allocated. The caller owns the arena and must keep it
+/// alive for the lifetime of any returned nodes. The parser never crashes on
+/// malformed input; unrecognized text falls back to TextNode.
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
@@ -38,35 +58,217 @@
namespace clang::doc::markdown {
+/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
+/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
+/// cheap range-based checks in classof() implementations.
enum class NodeKind {
+ // Inline nodes
+ NK_Text,
+ NK_SoftBreak,
+ NK_HardBreak,
+ NK_InlineCode,
+ NK_Emphasis,
+ NK_Strong,
+ NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+
// Block nodes
NK_Paragraph,
+ NK_Heading,
NK_FencedCode,
NK_Table,
NK_UnorderedList,
NK_OrderedList,
NK_ListItem,
+ NK_BlockQuote,
NK_ThematicBreak,
- // Inline nodes
- NK_Text,
- NK_InlineCode,
- NK_Emphasis,
- NK_Strong,
- NK_SoftBreak,
+ NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
};
+/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
+/// Nodes are arena-allocated and have no virtual destructor; use
+/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
struct MDNode {
NodeKind Kind;
- llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
- llvm::ArrayRef<MDNode> Children; // arena allocated
+ explicit MDNode(NodeKind K) : Kind(K) {}
+};
+
+//===----------------------------------------------------------------------===//
+// Inline nodes
+//===----------------------------------------------------------------------===//
+
+/// Plain text run.
+struct TextNode : MDNode {
+ llvm::StringRef Text;
+ explicit TextNode(llvm::StringRef Text)
+ : MDNode(NodeKind::NK_Text), Text(Text) {}
+ static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
+};
+
+/// Soft line break -- a newline that does not end the paragraph.
+struct SoftBreakNode : MDNode {
+ SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_SoftBreak;
+ }
+};
+
+/// Hard line break -- two or more trailing spaces, or a backslash, at line end.
+struct HardBreakNode : MDNode {
+ HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_HardBreak;
+ }
+};
+
+/// Inline code span: `code`. Code does not include the surrounding backticks.
+struct InlineCodeNode : MDNode {
+ llvm::StringRef Code;
+ explicit InlineCodeNode(llvm::StringRef Code)
+ : MDNode(NodeKind::NK_InlineCode), Code(Code) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_InlineCode;
+ }
+};
+
+/// Emphasized text: *text* or _text_.
+struct EmphasisNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Emphasis), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Emphasis;
+ }
};
-/// Parses Markdown from a single comment paragraph's text.
-/// Returns an empty ArrayRef if no Markdown constructs are found,
-/// so generators can fall back to plain-text rendering at zero cost.
-llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
- llvm::BumpPtrAllocator &Arena);
+/// Strongly emphasized text: **text** or __text__.
+struct StrongNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit StrongNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Strong), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Strong;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Block nodes
+//===----------------------------------------------------------------------===//
+
+/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// blank lines.
+struct ParagraphNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Paragraph), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Paragraph;
+ }
+};
+
+/// ATX heading: one to six leading # characters. Level is declared before
+/// Children to avoid padding between the base class's 4-byte Kind and the
+/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+struct HeadingNode : MDNode {
+ unsigned Level; // 1-6
+ llvm::ArrayRef<MDNode *> Children; // inline content
+ HeadingNode(unsigned Level, llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_Heading), Level(Level), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_Heading;
+ }
+};
+
+/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g.
+/// "cpp"); empty when no language was specified. Lines contains the raw text
+/// of each interior line, without the opening or closing fence.
+///
+/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// to 3 spaces; the closing fence must use the same character and be at least
+/// as long as the opening fence; only spaces may follow the closing fence.
+struct FencedCodeNode : MDNode {
+ llvm::StringRef Lang;
+ llvm::ArrayRef<llvm::StringRef> Lines;
+ FencedCodeNode(llvm::StringRef Lang, llvm::ArrayRef<llvm::StringRef> Lines)
+ : MDNode(NodeKind::NK_FencedCode), Lang(Lang), Lines(Lines) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_FencedCode;
+ }
+};
+
+/// Pipe table. Rows contains the raw text of each row line including the
+/// header and separator rows.
+/// TODO: replace with a structured header/body/cell representation.
+struct TableNode : MDNode {
+ llvm::ArrayRef<llvm::StringRef> Rows;
+ explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows)
+ : MDNode(NodeKind::NK_Table), Rows(Rows) {}
+ static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
+};
+
+/// A single list item. Children may contain block-level nodes for loose
+/// lists, or a single inline sequence for tight lists.
+struct ListItemNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_ListItem), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_ListItem;
+ }
+};
+
+/// Unordered (bullet) list. Markers are -, *, or +.
+struct UnorderedListNode : MDNode {
+ llvm::ArrayRef<ListItemNode *> Items;
+ explicit UnorderedListNode(llvm::ArrayRef<ListItemNode *> Items)
+ : MDNode(NodeKind::NK_UnorderedList), Items(Items) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_UnorderedList;
+ }
+};
+
+/// Ordered (numbered) list. Start is the number on the first item. Start is
+/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+struct OrderedListNode : MDNode {
+ unsigned Start;
+ llvm::ArrayRef<ListItemNode *> Items;
+ OrderedListNode(unsigned Start, llvm::ArrayRef<ListItemNode *> Items)
+ : MDNode(NodeKind::NK_OrderedList), Start(Start), Items(Items) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_OrderedList;
+ }
+};
+
+/// Block quote (> ...). Children are block-level nodes inside the quote.
+struct BlockQuoteNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children;
+ explicit BlockQuoteNode(llvm::ArrayRef<MDNode *> Children)
+ : MDNode(NodeKind::NK_BlockQuote), Children(Children) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_BlockQuote;
+ }
+};
+
+/// Thematic break: a line of three or more matching -, *, or _ characters.
+struct ThematicBreakNode : MDNode {
+ ThematicBreakNode() : MDNode(NodeKind::NK_ThematicBreak) {}
+ static bool classof(const MDNode *N) {
+ return N->Kind == NodeKind::NK_ThematicBreak;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Parser entry point
+//===----------------------------------------------------------------------===//
+
+/// Parse Markdown from a single paragraph of plain text. Returns a list of
+/// top-level nodes allocated in Arena. Lines that match no Markdown construct
+/// come back as TextNode entries; an empty ArrayRef means nothing was emitted
+/// (e.g. blank input). The parser never crashes on malformed input.
+///
+/// The caller must keep Arena alive for the lifetime of any returned nodes.
+llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
+ llvm::BumpPtrAllocator &Arena);
} // namespace clang::doc::markdown
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 4ca979c1f1d24..b61094f034375 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -8,9 +8,11 @@
#include "support/Markdown.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "gtest/gtest.h"
using namespace clang::doc::markdown;
+using namespace llvm;
namespace {
@@ -31,9 +33,8 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
TEST_F(MarkdownParserTest, PlainText) {
auto Nodes = parseMarkdown("hello world", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_Text);
- EXPECT_EQ(N.Content, "hello world");
+ auto *N = cast<TextNode>(Nodes[0]);
+ EXPECT_EQ(N->Text, "hello world");
}
TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -42,10 +43,9 @@ int x = 0;
````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "cpp");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
@@ -54,9 +54,8 @@ some code
```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Content.empty());
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lang.empty());
}
TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
@@ -74,7 +73,7 @@ TEST_F(MarkdownParserTest, PipeTable) {
| 1 | 2 |)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+ EXPECT_TRUE(isa<TableNode>(Nodes[0]));
}
TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
@@ -82,8 +81,8 @@ TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
c | d)",
Arena);
// No separator row so should not be parsed as a table.
- for (const auto &Node : Nodes)
- EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+ for (const auto *Node : Nodes)
+ EXPECT_FALSE(isa<TableNode>(Node));
}
TEST_F(MarkdownParserTest, UnorderedList) {
@@ -92,12 +91,11 @@ TEST_F(MarkdownParserTest, UnorderedList) {
- baz)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
- ASSERT_EQ(N.Children.size(), 3u);
- EXPECT_EQ(N.Children[0].Content, "foo");
- EXPECT_EQ(N.Children[1].Content, "bar");
- EXPECT_EQ(N.Children[2].Content, "baz");
+ auto *N = cast<UnorderedListNode>(Nodes[0]);
+ ASSERT_EQ(N->Items.size(), 3u);
+ EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+ EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
+ EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
}
TEST_F(MarkdownParserTest, MixedContent) {
@@ -117,10 +115,9 @@ int x = 0;
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Content.empty());
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lang.empty());
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 120: tilde fence with a language tag.
@@ -130,10 +127,9 @@ int x = 0;
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "cpp");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "cpp");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
@@ -144,10 +140,9 @@ aaa
````````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
// ~~~ is content, not a closing fence.
- ASSERT_EQ(N.Children.size(), 2u);
+ ASSERT_EQ(N->Lines.size(), 2u);
}
// CommonMark §4.5 example 130: a code block can be empty.
@@ -156,18 +151,16 @@ TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
```````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_TRUE(N.Children.empty());
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_TRUE(N->Lines.empty());
}
// CommonMark §4.5 example 129: a code block may contain only blank lines.
TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
auto Nodes = parseMarkdown("```\n\n \n```", Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- ASSERT_EQ(N.Children.size(), 2u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ ASSERT_EQ(N->Lines.size(), 2u);
}
// CommonMark §4.5 example 142: lang tag is captured from the info string.
@@ -179,10 +172,9 @@ end
``````)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "ruby");
- ASSERT_EQ(N.Children.size(), 3u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "ruby");
+ ASSERT_EQ(N->Lines.size(), 3u);
}
// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
@@ -192,10 +184,9 @@ foo
~~~)",
Arena);
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- EXPECT_EQ(N.Content, "aa ``` ~~~");
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ EXPECT_EQ(N->Lang, "aa ``` ~~~");
+ ASSERT_EQ(N->Lines.size(), 1u);
}
// CommonMark §4.5 example 124: closing fence must be at least as long as the
@@ -209,9 +200,8 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
// parser currently treats it as a closing fence. This test documents the
// current (non-conformant) behavior.
ASSERT_EQ(Nodes.size(), 1u);
- const auto &N = Nodes[0];
- EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
- ASSERT_EQ(N.Children.size(), 1u);
+ auto *N = cast<FencedCodeNode>(Nodes[0]);
+ ASSERT_EQ(N->Lines.size(), 1u);
}
} // namespace
\ No newline at end of file
>From 25fe7daff183f51a8b31ed0d8481b9a2f1fbdbd8 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 02:33:44 -0400
Subject: [PATCH 5/5] [clang-doc] Introduce LineReader cursor for the Markdown
parse loop
Replace the raw size_t I = 0, E = Lines.size() index arithmetic in
parseMarkdown() with a LineReader cursor that encapsulates the position
and exposes peek(), peek(Offset), advance(), and atEnd(). The parse
logic and emitted nodes are unchanged; this only removes manual index
bookkeeping. All 18 MarkdownParserTest cases still pass.
Co-Authored-By: Claude Opus 4.8 <noreply at anthropic.com>
---
.../clang-doc/support/Markdown.cpp | 73 ++++++++++++++-----
1 file changed, 54 insertions(+), 19 deletions(-)
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index bee15c3e23ec3..f171457e73046 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLog.h"
+#include <cassert>
#define DEBUG_TYPE "clang-doc"
@@ -52,6 +53,42 @@ static bool isListItem(StringRef Line) {
Line.starts_with("+ ");
}
+// A forward cursor over the lines of a paragraph. Encapsulates the parse
+// position so the loop can inspect the current or an upcoming line and consume
+// lines without manual index arithmetic. Lines are stored untrimmed; callers
+// trim where they need a normalized view.
+class LineReader {
+public:
+ explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+ // True once every line has been consumed.
+ bool atEnd() const { return Pos >= Lines.size(); }
+
+ // The current line, untrimmed. Must not be called when atEnd().
+ StringRef peek() const {
+ assert(!atEnd() && "peek past end of input");
+ return Lines[Pos];
+ }
+
+ // The line Offset positions ahead of the cursor, or an empty StringRef when
+ // that position is past the end. peek(0) is the current line.
+ StringRef peek(size_t Offset) const {
+ size_t Target = Pos + Offset;
+ return Target < Lines.size() ? Lines[Target] : StringRef();
+ }
+
+ // Consume the current line and return it, untrimmed. Must not be called when
+ // atEnd().
+ StringRef advance() {
+ assert(!atEnd() && "advance past end of input");
+ return Lines[Pos++];
+ }
+
+private:
+ ArrayRef<StringRef> Lines;
+ size_t Pos = 0;
+};
+
ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
BumpPtrAllocator &Arena) {
if (ParagraphText.trim().empty())
@@ -61,13 +98,13 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
ParagraphText.split(Lines, '\n');
SmallVector<MDNode *> Nodes;
- size_t I = 0, E = Lines.size();
+ LineReader Reader(Lines);
- while (I < E) {
- StringRef Line = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef Line = Reader.peek().trim();
if (Line.empty()) {
- ++I;
+ Reader.advance();
continue;
}
@@ -79,18 +116,18 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
if (Line.starts_with("```") || Line.starts_with("~~~")) {
char Fence = Line[0];
StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+ Reader.advance(); // consume opening fence
SmallVector<StringRef> CodeLines;
- ++I;
- while (I < E) {
- StringRef CodeLine = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef CodeLine = Reader.peek().trim();
if (CodeLine.size() >= 3 &&
all_of(CodeLine.take_front(3),
[Fence](char C) { return C == Fence; }))
break;
- CodeLines.push_back(internString(Lines[I], Arena));
- ++I;
+ CodeLines.push_back(internString(Reader.advance(), Arena));
}
- ++I; // skip closing fence
+ if (!Reader.atEnd())
+ Reader.advance(); // consume closing fence
auto *Code =
new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
LDBG() << "emitting FencedCodeNode lang='" << Lang
@@ -100,12 +137,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
}
// Pipe table: current line has | and next line is a separator row.
- if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+ if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
SmallVector<StringRef> Rows;
- while (I < E && Lines[I].trim().contains('|')) {
- Rows.push_back(internString(Lines[I].trim(), Arena));
- ++I;
- }
+ while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+ Rows.push_back(internString(Reader.advance().trim(), Arena));
auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
LDBG() << "emitting TableNode rows=" << Rows.size();
Nodes.push_back(Table);
@@ -115,8 +150,8 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Unordered list item.
if (isListItem(Line)) {
SmallVector<ListItemNode *> Items;
- while (I < E) {
- StringRef L = Lines[I].trim();
+ while (!Reader.atEnd()) {
+ StringRef L = Reader.peek().trim();
if (!isListItem(L))
break;
StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
@@ -125,7 +160,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
auto *Item =
new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
Items.push_back(Item);
- ++I;
+ Reader.advance();
}
auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
LDBG() << "emitting UnorderedListNode items=" << Items.size();
@@ -135,7 +170,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
// Plain text fallback.
Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
- ++I;
+ Reader.advance();
}
LDBG() << "parseMarkdown done nodes=" << Nodes.size();
More information about the cfe-commits
mailing list