[clang-tools-extra] [clang-doc] Add standalone Markdown parsing library (PR #202991)
Neil Nair via cfe-commits
cfe-commits at lists.llvm.org
Wed Jun 10 07:20:15 PDT 2026
https://github.com/Neil-N4 created https://github.com/llvm/llvm-project/pull/202991
Adds a standalone Markdown parsing library under clang-doc/support. The parser takes plain paragraph text and returns a flat list of typed nodes (fenced code blocks, pipe tables, unordered lists, plain text fallback). It has no knowledge of Doxygen or Clang-Doc internals. All nodes are arena-allocated. Returns an empty ArrayRef if no Markdown constructs are found so generators can fall back to plain text at zero cost.
Currently handles: fenced code blocks (``` or ~~~), pipe tables, unordered lists, plain text. 10 unit tests included. Integration into the JSON generator is tracked separately. Assisted-by: Claude
>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 09:51:48 -0400
Subject: [PATCH] [clang-doc] Add standalone Markdown parsing library
---
.../clang-doc/support/CMakeLists.txt | 3 +-
.../clang-doc/support/Markdown.cpp | 145 ++++++++++++++++++
.../clang-doc/support/Markdown.h | 72 +++++++++
.../unittests/clang-doc/CMakeLists.txt | 4 +-
.../clang-doc/MarkdownParserTest.cpp | 94 ++++++++++++
5 files changed, 316 insertions(+), 2 deletions(-)
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp
create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h
create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt
index 8ac913ffbe998..acff865190ff9 100644
--- a/clang-tools-extra/clang-doc/support/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt
@@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS
add_clang_library(clangDocSupport STATIC
File.cpp
+ Markdown.cpp
Utils.cpp
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
new file mode 100644
index 0000000000000..776150b939d27
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLog.h"
+
+#define DEBUG_TYPE "clang-doc-markdown"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+static MDNode makeText(StringRef S) {
+  return MDNode{NodeKind::NK_Text, S, ArrayRef<MDNode>()};
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces, and
+// has at least one - and one | (table rows are recognized by their |).
+static bool isSepRow(StringRef Line) {
+  return Line.contains('-') && Line.contains('|') &&
+         Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Reports whether Line opens with an unordered-list bullet: one of -, *,
+// or +, immediately followed by a space.
+static bool isListItem(StringRef Line) {
+  return Line.size() >= 2 && Line[1] == ' ' &&
+         StringRef("-*+").contains(Line[0]);
+}
+
+// Copies Nodes into Arena and returns a view of the arena-owned copy, so the
+// result stays valid after the caller's temporary vector is destroyed.
+static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
+                                      BumpPtrAllocator &Arena) {
+  if (Nodes.empty())
+    return {};
+  return ArrayRef<MDNode>(Nodes).copy(Arena);
+}
+
+ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
+ BumpPtrAllocator &Arena) {
+ if (ParagraphText.trim().empty())
+ return {};
+
+ SmallVector<StringRef, 16> Lines;
+ ParagraphText.split(Lines, '\n');
+
+ SmallVector<MDNode> Nodes;
+ size_t I = 0, E = Lines.size();
+
+ while (I < E) {
+ StringRef Line = Lines[I].trim();
+
+ if (Line.empty()) {
+ ++I;
+ continue;
+ }
+
+ // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+ // indented up to 3 spaces, the closing fence must use the same character
+ // and be at least as long as the opening fence, and the closing fence may
+ // only be followed by spaces. Doxygen specifics should be handled on a
+ // case-by-case basis.
+ if (Line.starts_with("```") || Line.starts_with("~~~")) {
+ char Fence = Line[0];
+ StringRef Lang = Line.drop_front(3).trim();
+ SmallVector<MDNode> CodeLines;
+ ++I;
+ while (I < E) {
+ StringRef CodeLine = Lines[I].trim();
+ if (CodeLine.size() >= 3 &&
+ all_of(CodeLine.take_front(3),
+ [Fence](char C) { return C == Fence; }))
+ break;
+ CodeLines.push_back(makeText(Lines[I]));
+ ++I;
+ }
+ ++I; // skip closing fence
+ MDNode Code;
+ Code.Kind = NodeKind::NK_FencedCode;
+ Code.Content = Lang;
+ Code.Children = allocateNodes(CodeLines, Arena);
+ LDBG() << "emitting NK_FencedCode lang='" << Lang
+ << "' lines=" << CodeLines.size();
+ Nodes.push_back(Code);
+ continue;
+ }
+
+ // Pipe table: current line has | and next line is a separator row.
+ if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+ SmallVector<MDNode> Rows;
+ while (I < E && Lines[I].trim().contains('|')) {
+ Rows.push_back(makeText(Lines[I].trim()));
+ ++I;
+ }
+ MDNode Table;
+ Table.Kind = NodeKind::NK_Table;
+ Table.Content = {};
+ Table.Children = allocateNodes(Rows, Arena);
+ LDBG() << "emitting NK_Table rows=" << Rows.size();
+ Nodes.push_back(Table);
+ continue;
+ }
+
+ // Unordered list item.
+ if (isListItem(Line)) {
+ SmallVector<MDNode> Items;
+ while (I < E) {
+ StringRef L = Lines[I].trim();
+ if (!isListItem(L))
+ break;
+ MDNode Item;
+ Item.Kind = NodeKind::NK_ListItem;
+ Item.Content = L.drop_front(2).trim();
+ Item.Children = {};
+ Items.push_back(Item);
+ ++I;
+ }
+ MDNode List;
+ List.Kind = NodeKind::NK_UnorderedList;
+ List.Content = {};
+ List.Children = allocateNodes(Items, Arena);
+ LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+ Nodes.push_back(List);
+ continue;
+ }
+
+ // Plain text fallback.
+ Nodes.push_back(makeText(Line));
+ ++I;
+ }
+
+ LDBG() << "parseMarkdown done nodes=" << Nodes.size();
+ return allocateNodes(Nodes, Arena);
+}
+
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
new file mode 100644
index 0000000000000..890f764f937b1
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines a standalone Markdown parsing library for the LLVM
+/// ecosystem. The parser takes plain text and returns a tree of typed nodes
+/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+///
+/// This is a simple Markdown parser for use inside Clang-Doc's comment
+/// pipeline. You give it a paragraph of text and an arena allocator, and it
+/// gives back a list of typed nodes describing the Markdown structure it found.
+///
+/// The main entry point is parseMarkdown(). If the text is empty or contains
+/// only whitespace, you get back an empty list. Otherwise you get a list of
+/// MDNode structs where each node has a kind, optional content (like the
+/// language tag on a code fence), and optional children.
+///
+/// All nodes are allocated in the arena you pass in. You own the arena and are
+/// responsible for keeping it alive as long as you use the nodes.
+///
+/// The parser handles fenced code blocks, pipe tables, and unordered lists.
+/// Anything it does not recognize comes back as a plain text node. It will
+/// never crash on bad input.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+
+namespace clang::doc::markdown {
+
+enum class NodeKind {
+ // Block nodes
+ NK_Paragraph,
+ NK_FencedCode,
+ NK_Table,
+ NK_UnorderedList,
+ NK_OrderedList,
+ NK_ListItem,
+ NK_ThematicBreak,
+ // Inline nodes
+ NK_Text,
+ NK_InlineCode,
+ NK_Emphasis,
+ NK_Strong,
+ NK_SoftBreak,
+};
+
+struct MDNode {
+ NodeKind Kind;
+ llvm::StringRef Content; // lang tag for FencedCode; text for Text/ListItem
+ llvm::ArrayRef<MDNode> Children; // arena allocated: code lines, rows, items
+};
+
+/// Parses Markdown from a single comment paragraph's text into nodes that
+/// are allocated in Arena. Returns an empty ArrayRef only for empty or
+/// whitespace-only input; unrecognized lines come back as NK_Text nodes.
+llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
+ llvm::BumpPtrAllocator &Arena);
+
+} // namespace clang::doc::markdown
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 01b34ec9a791e..b74207ac88fa7 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests
ClangDocTest.cpp
GeneratorTest.cpp
HTMLGeneratorTest.cpp
+ MarkdownParserTest.cpp
MDGeneratorTest.cpp
MergeTest.cpp
SerializeTest.cpp
@@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests
target_link_libraries(ClangDocTests
PRIVATE
clangDoc
+ clangDocSupport
LLVMTestingSupport
- )
+ )
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
new file mode 100644
index 0000000000000..8df5efc7f1d5f
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/Support/Allocator.h"
+#include "gtest/gtest.h"
+
+using namespace clang::doc::markdown;
+
+namespace {
+
+TEST(MarkdownParserTest, EmptyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, WhitespaceOnlyInput) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown(" \n \n", Arena);
+ EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("hello world", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
+ EXPECT_EQ(Nodes[0].Content, "hello world");
+}
+
+TEST(MarkdownParserTest, FencedCodeBlock) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_EQ(Nodes[0].Content, "cpp");
+ ASSERT_EQ(Nodes[0].Children.size(), 1u);
+}
+
+TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+ EXPECT_TRUE(Nodes[0].Content.empty());
+}
+
+TEST(MarkdownParserTest, UnterminatedFenceProducesCodeNode) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+  // An unterminated fence must not crash; the rest of the input is its body.
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+}
+
+TEST(MarkdownParserTest, PipeTable) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("a | b\nc | d", Arena);
+ // No separator row so should not be parsed as a table
+ for (const auto &Node : Nodes)
+ EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, UnorderedList) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+ ASSERT_EQ(Nodes.size(), 1u);
+ EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
+ ASSERT_EQ(Nodes[0].Children.size(), 3u);
+ EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
+ EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
+ EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+}
+
+TEST(MarkdownParserTest, MixedContent) {
+ llvm::BumpPtrAllocator Arena;
+ auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+ EXPECT_EQ(Nodes.size(), 3u);
+}
+
+} // namespace
\ No newline at end of file
More information about the cfe-commits
mailing list